From c757249af152c59fd74b85e52e8c090acb33d9c0 Mon Sep 17 00:00:00 2001
From: Shailabh Nagar <nagar@watson.ibm.com>
Date: Fri, 14 Jul 2006 00:24:40 -0700
Subject: [PATCH] per-task-delay-accounting: taskstats interface

Create a "taskstats" interface based on generic netlink (NETLINK_GENERIC
family), for getting statistics of tasks and thread groups during their
lifetime and when they exit.  The interface is intended for use by multiple
accounting packages though it is being created in the context of delay
accounting.

This patch creates the interface without populating the fields of the data
that is sent to the user in response to a command or upon the exit of a task.
Each accounting package interested in using taskstats has to provide an
additional patch to add its stats to the common structure.

[akpm@osdl.org: cleanups, Kconfig fix]
Signed-off-by: Shailabh Nagar <nagar@us.ibm.com>
Signed-off-by: Balbir Singh <balbir@in.ibm.com>
Cc: Jes Sorensen <jes@sgi.com>
Cc: Peter Chubb <peterc@gelato.unsw.edu.au>
Cc: Erich Focht <efocht@ess.nec.de>
Cc: Levent Serinol <lserinol@gmail.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/taskstats.c | 336 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 336 insertions(+)
 create mode 100644 kernel/taskstats.c

(limited to 'kernel/taskstats.c')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
new file mode 100644
index 00000000000..82ec9137d90
--- /dev/null
+++ b/kernel/taskstats.c
@@ -0,0 +1,336 @@
+/*
+ * taskstats.c - Export per-task statistics to userland
+ *
+ * Copyright (C) Shailabh Nagar, IBM Corp. 2006
+ *           (C) Balbir Singh,   IBM Corp. 2006
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/taskstats_kern.h>
+#include <net/genetlink.h>
+#include <asm/atomic.h>
+
+static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
+static int family_registered;
+kmem_cache_t *taskstats_cache;
+static DEFINE_MUTEX(taskstats_exit_mutex);
+
+static struct genl_family family = {
+	.id		= GENL_ID_GENERATE,
+	.name		= TASKSTATS_GENL_NAME,
+	.version	= TASKSTATS_GENL_VERSION,
+	.maxattr	= TASKSTATS_CMD_ATTR_MAX,
+};
+
+static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1]
+__read_mostly = {
+	[TASKSTATS_CMD_ATTR_PID]  = { .type = NLA_U32 },
+	[TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
+};
+
+
+static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
+			void **replyp, size_t size)
+{
+	struct sk_buff *skb;
+	void *reply;
+
+	/*
+	 * If new attributes are added, please revisit this allocation
+	 */
+	skb = nlmsg_new(size);
+	if (!skb)
+		return -ENOMEM;
+
+	if (!info) {
+		int seq = get_cpu_var(taskstats_seqnum)++;
+		put_cpu_var(taskstats_seqnum);
+
+		reply = genlmsg_put(skb, 0, seq,
+				family.id, 0, 0,
+				cmd, family.version);
+	} else
+		reply = genlmsg_put(skb, info->snd_pid, info->snd_seq,
+				family.id, 0, 0,
+				cmd, family.version);
+	if (reply == NULL) {
+		nlmsg_free(skb);
+		return -EINVAL;
+	}
+
+	*skbp = skb;
+	*replyp = reply;
+	return 0;
+}
+
+static int send_reply(struct sk_buff *skb, pid_t pid, int event)
+{
+	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
+	void *reply;
+	int rc;
+
+	reply = genlmsg_data(genlhdr);
+
+	rc = genlmsg_end(skb, reply);
+	if (rc < 0) {
+		nlmsg_free(skb);
+		return rc;
+	}
+
+	if (event == TASKSTATS_MSG_MULTICAST)
+		return genlmsg_multicast(skb, pid, TASKSTATS_LISTEN_GROUP);
+	return genlmsg_unicast(skb, pid);
+}
+
+static int fill_pid(pid_t pid, struct task_struct *pidtsk,
+		struct taskstats *stats)
+{
+	int rc;
+	struct task_struct *tsk = pidtsk;
+
+	if (!pidtsk) {
+		read_lock(&tasklist_lock);
+		tsk = find_task_by_pid(pid);
+		if (!tsk) {
+			read_unlock(&tasklist_lock);
+			return -ESRCH;
+		}
+		get_task_struct(tsk);
+		read_unlock(&tasklist_lock);
+	} else
+		get_task_struct(tsk);
+
+	/*
+	 * Each accounting subsystem adds calls to its functions to
+	 * fill in relevant parts of struct taskstsats as follows
+	 *
+	 *	rc = per-task-foo(stats, tsk);
+	 *	if (rc)
+	 *		goto err;
+	 */
+
+err:
+	put_task_struct(tsk);
+	return rc;
+
+}
+
+static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
+		struct taskstats *stats)
+{
+	int rc;
+	struct task_struct *tsk, *first;
+
+	first = tgidtsk;
+	read_lock(&tasklist_lock);
+	if (!first) {
+		first = find_task_by_pid(tgid);
+		if (!first) {
+			read_unlock(&tasklist_lock);
+			return -ESRCH;
+		}
+	}
+	tsk = first;
+	do {
+		/*
+		 * Each accounting subsystem adds calls its functions to
+		 * fill in relevant parts of struct taskstsats as follows
+		 *
+		 *	rc = per-task-foo(stats, tsk);
+		 *	if (rc)
+		 *		break;
+		 */
+
+	} while_each_thread(first, tsk);
+	read_unlock(&tasklist_lock);
+
+	/*
+	 * Accounting subsytems can also add calls here if they don't
+	 * wish to aggregate statistics for per-tgid stats
+	 */
+
+	return rc;
+}
+
+static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info)
+{
+	int rc = 0;
+	struct sk_buff *rep_skb;
+	struct taskstats stats;
+	void *reply;
+	size_t size;
+	struct nlattr *na;
+
+	/*
+	 * Size includes space for nested attributes
+	 */
+	size = nla_total_size(sizeof(u32)) +
+		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
+
+	memset(&stats, 0, sizeof(stats));
+	rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
+	if (rc < 0)
+		return rc;
+
+	if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
+		u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
+		rc = fill_pid(pid, NULL, &stats);
+		if (rc < 0)
+			goto err;
+
+		na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
+		NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid);
+		NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
+				stats);
+	} else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
+		u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
+		rc = fill_tgid(tgid, NULL, &stats);
+		if (rc < 0)
+			goto err;
+
+		na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
+		NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid);
+		NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
+				stats);
+	} else {
+		rc = -EINVAL;
+		goto err;
+	}
+
+	nla_nest_end(rep_skb, na);
+
+	return send_reply(rep_skb, info->snd_pid, TASKSTATS_MSG_UNICAST);
+
+nla_put_failure:
+	return genlmsg_cancel(rep_skb, reply);
+err:
+	nlmsg_free(rep_skb);
+	return rc;
+}
+
+/* Send pid data out on exit */
+void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
+			struct taskstats *tgidstats)
+{
+	int rc;
+	struct sk_buff *rep_skb;
+	void *reply;
+	size_t size;
+	int is_thread_group;
+	struct nlattr *na;
+
+	if (!family_registered || !tidstats)
+		return;
+
+	mutex_lock(&taskstats_exit_mutex);
+
+	is_thread_group = !thread_group_empty(tsk);
+	rc = 0;
+
+	/*
+	 * Size includes space for nested attributes
+	 */
+	size = nla_total_size(sizeof(u32)) +
+		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
+
+	if (is_thread_group)
+		size = 2 * size;	/* PID + STATS + TGID + STATS */
+
+	rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
+	if (rc < 0)
+		goto ret;
+
+	rc = fill_pid(tsk->pid, tsk, tidstats);
+	if (rc < 0)
+		goto err_skb;
+
+	na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
+	NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid);
+	NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
+			*tidstats);
+	nla_nest_end(rep_skb, na);
+
+	if (!is_thread_group || !tgidstats) {
+		send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
+		goto ret;
+	}
+
+	rc = fill_tgid(tsk->pid, tsk, tgidstats);
+	/*
+	 * If fill_tgid() failed then one probable reason could be that the
+	 * thread group leader has exited. fill_tgid() will fail, send out
+	 * the pid statistics collected earlier.
+	 */
+	if (rc < 0) {
+		send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
+		goto ret;
+	}
+
+	na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
+	NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid);
+	NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
+			*tgidstats);
+	nla_nest_end(rep_skb, na);
+
+	send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
+	goto ret;
+
+nla_put_failure:
+	genlmsg_cancel(rep_skb, reply);
+	goto ret;
+err_skb:
+	nlmsg_free(rep_skb);
+ret:
+	mutex_unlock(&taskstats_exit_mutex);
+	return;
+}
+
+static struct genl_ops taskstats_ops = {
+	.cmd		= TASKSTATS_CMD_GET,
+	.doit		= taskstats_send_stats,
+	.policy		= taskstats_cmd_get_policy,
+};
+
+/* Needed early in initialization */
+void __init taskstats_init_early(void)
+{
+	taskstats_cache = kmem_cache_create("taskstats_cache",
+						sizeof(struct taskstats),
+						0, SLAB_PANIC, NULL, NULL);
+}
+
+static int __init taskstats_init(void)
+{
+	int rc;
+
+	rc = genl_register_family(&family);
+	if (rc)
+		return rc;
+
+	rc = genl_register_ops(&family, &taskstats_ops);
+	if (rc < 0)
+		goto err;
+
+	family_registered = 1;
+	return 0;
+err:
+	genl_unregister_family(&family);
+	return rc;
+}
+
+/*
+ * late initcall ensures initialization of statistics collection
+ * mechanisms precedes initialization of the taskstats interface
+ */
+late_initcall(taskstats_init);
-- 
cgit v1.2.3-70-g09d2


From 6f44993fe1d7b2b097f6ac60cd5835c6f5ca0874 Mon Sep 17 00:00:00 2001
From: Shailabh Nagar <nagar@watson.ibm.com>
Date: Fri, 14 Jul 2006 00:24:41 -0700
Subject: [PATCH] per-task-delay-accounting: delay accounting usage of
 taskstats interface

Usage of taskstats interface by delay accounting.

Signed-off-by: Shailabh Nagar <nagar@us.ibm.com>
Signed-off-by: Balbir Singh <balbir@in.ibm.com>
Cc: Jes Sorensen <jes@sgi.com>
Cc: Peter Chubb <peterc@gelato.unsw.edu.au>
Cc: Erich Focht <efocht@ess.nec.de>
Cc: Levent Serinol <lserinol@gmail.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/delayacct.h      | 15 ++++++++++
 include/linux/sched.h          |  1 +
 include/linux/taskstats.h      | 55 ++++++++++++++++++++++++++++++++++++-
 include/linux/taskstats_kern.h |  1 +
 init/Kconfig                   |  1 +
 kernel/delayacct.c             | 62 +++++++++++++++++++++++++++++++++++++++++-
 kernel/taskstats.c             | 16 +++++++----
 7 files changed, 144 insertions(+), 7 deletions(-)

(limited to 'kernel/taskstats.c')

diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 0ecbf9aad8e..d955078a144 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -18,6 +18,7 @@
 #define _LINUX_DELAYACCT_H
 
 #include <linux/sched.h>
+#include <linux/taskstats_kern.h>
 
 /*
  * Per-task flags relevant to delay accounting
@@ -35,6 +36,7 @@ extern void __delayacct_tsk_init(struct task_struct *);
 extern void __delayacct_tsk_exit(struct task_struct *);
 extern void __delayacct_blkio_start(void);
 extern void __delayacct_blkio_end(void);
+extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *);
 
 static inline void delayacct_set_flag(int flag)
 {
@@ -74,6 +76,16 @@ static inline void delayacct_blkio_end(void)
 		__delayacct_blkio_end();
 }
 
+static inline int delayacct_add_tsk(struct taskstats *d,
+					struct task_struct *tsk)
+{
+	if (likely(!delayacct_on))
+		return -EINVAL;
+	if (!tsk->delays)
+		return 0;
+	return __delayacct_add_tsk(d, tsk);
+}
+
 #else
 static inline void delayacct_set_flag(int flag)
 {}
@@ -89,6 +101,9 @@ static inline void delayacct_blkio_start(void)
 {}
 static inline void delayacct_blkio_end(void)
 {}
+static inline int delayacct_add_tsk(struct taskstats *d,
+					struct task_struct *tsk)
+{ return 0; }
 #endif /* CONFIG_TASK_DELAY_ACCT */
 
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f751062d89a..3c5610ca0c9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -990,6 +990,7 @@ struct task_struct {
 	 */
 	struct pipe_inode_info *splice_pipe;
 #ifdef	CONFIG_TASK_DELAY_ACCT
+	spinlock_t delays_lock;
 	struct task_delay_info *delays;
 #endif
 };
diff --git a/include/linux/taskstats.h b/include/linux/taskstats.h
index 51f62759bea..c6aeca32348 100644
--- a/include/linux/taskstats.h
+++ b/include/linux/taskstats.h
@@ -34,7 +34,60 @@
 struct taskstats {
 
 	/* Version 1 */
-	__u64	version;
+	__u16	version;
+	__u16	padding[3];	/* Userspace should not interpret the padding
+				 * field which can be replaced by useful
+				 * fields if struct taskstats is extended.
+				 */
+
+	/* Delay accounting fields start
+	 *
+	 * All values, until comment "Delay accounting fields end" are
+	 * available only if delay accounting is enabled, even though the last
+	 * few fields are not delays
+	 *
+	 * xxx_count is the number of delay values recorded
+	 * xxx_delay_total is the corresponding cumulative delay in nanoseconds
+	 *
+	 * xxx_delay_total wraps around to zero on overflow
+	 * xxx_count incremented regardless of overflow
+	 */
+
+	/* Delay waiting for cpu, while runnable
+	 * count, delay_total NOT updated atomically
+	 */
+	__u64	cpu_count;
+	__u64	cpu_delay_total;
+
+	/* Following four fields atomically updated using task->delays->lock */
+
+	/* Delay waiting for synchronous block I/O to complete
+	 * does not account for delays in I/O submission
+	 */
+	__u64	blkio_count;
+	__u64	blkio_delay_total;
+
+	/* Delay waiting for page fault I/O (swap in only) */
+	__u64	swapin_count;
+	__u64	swapin_delay_total;
+
+	/* cpu "wall-clock" running time
+	 * On some architectures, value will adjust for cpu time stolen
+	 * from the kernel in involuntary waits due to virtualization.
+	 * Value is cumulative, in nanoseconds, without a corresponding count
+	 * and wraps around to zero silently on overflow
+	 */
+	__u64	cpu_run_real_total;
+
+	/* cpu "virtual" running time
+	 * Uses time intervals seen by the kernel i.e. no adjustment
+	 * for kernel's involuntary waits due to virtualization.
+	 * Value is cumulative, in nanoseconds, without a corresponding count
+	 * and wraps around to zero silently on overflow
+	 */
+	__u64	cpu_run_virtual_total;
+	/* Delay accounting fields end */
+	/* version 1 ends here */
 };
 
 
diff --git a/include/linux/taskstats_kern.h b/include/linux/taskstats_kern.h
index bd0ecb969c2..fc9da2e2644 100644
--- a/include/linux/taskstats_kern.h
+++ b/include/linux/taskstats_kern.h
@@ -17,6 +17,7 @@ enum {
 
 #ifdef CONFIG_TASKSTATS
 extern kmem_cache_t *taskstats_cache;
+extern struct mutex taskstats_exit_mutex;
 
 static inline void taskstats_exit_alloc(struct taskstats **ptidstats,
 					struct taskstats **ptgidstats)
diff --git a/init/Kconfig b/init/Kconfig
index 56a7093b4e4..a099fc6526d 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -173,6 +173,7 @@ config TASKSTATS
 
 config TASK_DELAY_ACCT
 	bool "Enable per-task delay accounting (EXPERIMENTAL)"
+	depends on TASKSTATS
 	help
 	  Collect information on time spent by a task waiting for system
 	  resources like cpu, synchronous block I/O completion and swapping
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 3546b0800f9..1be274a462c 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -41,6 +41,10 @@ void delayacct_init(void)
 
 void __delayacct_tsk_init(struct task_struct *tsk)
 {
+	spin_lock_init(&tsk->delays_lock);
+	/* No need to acquire tsk->delays_lock for allocation here unless
+	   __delayacct_tsk_init called after tsk is attached to tasklist
+	*/
 	tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL);
 	if (tsk->delays)
 		spin_lock_init(&tsk->delays->lock);
@@ -48,8 +52,11 @@ void __delayacct_tsk_init(struct task_struct *tsk)
 
 void __delayacct_tsk_exit(struct task_struct *tsk)
 {
-	kmem_cache_free(delayacct_cache, tsk->delays);
+	struct task_delay_info *delays = tsk->delays;
+	spin_lock(&tsk->delays_lock);
 	tsk->delays = NULL;
+	spin_unlock(&tsk->delays_lock);
+	kmem_cache_free(delayacct_cache, delays);
 }
 
 /*
@@ -104,3 +111,56 @@ void __delayacct_blkio_end(void)
 			&current->delays->blkio_delay,
 			&current->delays->blkio_count);
 }
+
+int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
+{
+	s64 tmp;
+	struct timespec ts;
+	unsigned long t1,t2,t3;
+
+	spin_lock(&tsk->delays_lock);
+
+	/* Though tsk->delays accessed later, early exit avoids
+	 * unnecessary returning of other data
+	 */
+	if (!tsk->delays)
+		goto done;
+
+	tmp = (s64)d->cpu_run_real_total;
+	cputime_to_timespec(tsk->utime + tsk->stime, &ts);
+	tmp += timespec_to_ns(&ts);
+	d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
+
+	/*
+	 * No locking available for sched_info (and too expensive to add one)
+	 * Mitigate by taking snapshot of values
+	 */
+	t1 = tsk->sched_info.pcnt;
+	t2 = tsk->sched_info.run_delay;
+	t3 = tsk->sched_info.cpu_time;
+
+	d->cpu_count += t1;
+
+	jiffies_to_timespec(t2, &ts);
+	tmp = (s64)d->cpu_delay_total + timespec_to_ns(&ts);
+	d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;
+
+	tmp = (s64)d->cpu_run_virtual_total + (s64)jiffies_to_usecs(t3) * 1000;
+	d->cpu_run_virtual_total =
+		(tmp < (s64)d->cpu_run_virtual_total) ?	0 : tmp;
+
+	/* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
+
+	spin_lock(&tsk->delays->lock);
+	tmp = d->blkio_delay_total + tsk->delays->blkio_delay;
+	d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
+	tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
+	d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
+	d->blkio_count += tsk->delays->blkio_count;
+	d->swapin_count += tsk->delays->swapin_count;
+	spin_unlock(&tsk->delays->lock);
+
+done:
+	spin_unlock(&tsk->delays_lock);
+	return 0;
+}
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 82ec9137d90..ea9506de3b8 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -18,13 +18,13 @@
 
 #include <linux/kernel.h>
 #include <linux/taskstats_kern.h>
+#include <linux/delayacct.h>
 #include <net/genetlink.h>
 #include <asm/atomic.h>
 
 static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
 static int family_registered;
 kmem_cache_t *taskstats_cache;
-static DEFINE_MUTEX(taskstats_exit_mutex);
 
 static struct genl_family family = {
 	.id		= GENL_ID_GENERATE,
@@ -120,7 +120,10 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk,
 	 *		goto err;
 	 */
 
-err:
+	rc = delayacct_add_tsk(stats, tsk);
+	stats->version = TASKSTATS_VERSION;
+
+	/* Define err: label here if needed */
 	put_task_struct(tsk);
 	return rc;
 
@@ -152,8 +155,14 @@ static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
 		 *		break;
 		 */
 
+		rc = delayacct_add_tsk(stats, tsk);
+		if (rc)
+			break;
+
 	} while_each_thread(first, tsk);
 	read_unlock(&tasklist_lock);
+	stats->version = TASKSTATS_VERSION;
+
 
 	/*
 	 * Accounting subsytems can also add calls here if they don't
@@ -233,8 +242,6 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
 	if (!family_registered || !tidstats)
 		return;
 
-	mutex_lock(&taskstats_exit_mutex);
-
 	is_thread_group = !thread_group_empty(tsk);
 	rc = 0;
 
@@ -292,7 +299,6 @@ nla_put_failure:
 err_skb:
 	nlmsg_free(rep_skb);
 ret:
-	mutex_unlock(&taskstats_exit_mutex);
 	return;
 }
 
-- 
cgit v1.2.3-70-g09d2


From ad4ecbcba72855a2b5319b96e2a3a65ed1ca3bfd Mon Sep 17 00:00:00 2001
From: Shailabh Nagar <nagar@watson.ibm.com>
Date: Fri, 14 Jul 2006 00:24:44 -0700
Subject: [PATCH] delay accounting taskstats interface send tgid once

Send per-tgid data only once during exit of a thread group instead of once
with each member thread exit.

Currently, when a thread exits, besides its per-tid data, the per-tgid data
of its thread group is also sent out, if its thread group is non-empty.
The per-tgid data sent consists of the sum of per-tid stats for all
*remaining* threads of the thread group.

This patch modifies this sending in two ways:

- the per-tgid data is sent only when the last thread of a thread group
  exits.  This cuts down heavily on the overhead of sending/receiving
  per-tgid data, especially when other exploiters of the taskstats
  interface aren't interested in per-tgid stats

- the semantics of the per-tgid data sent are changed.  Instead of being
  the sum of per-tid data for remaining threads, the value now sent is the
  true total accumalated statistics for all threads that are/were part of
  the thread group.

The patch also addresses a minor issue where failure of one accounting
subsystem to fill in the taskstats structure was causing the send of
taskstats to not be sent at all.

The patch has been tested for stability and run cerberus for over 4 hours
on an SMP.

[akpm@osdl.org: bugfixes]
Signed-off-by: Shailabh Nagar <nagar@watson.ibm.com>
Signed-off-by: Balbir Singh <balbir@in.ibm.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/accounting/delay-accounting.txt | 13 ++--
 Documentation/accounting/taskstats.txt        | 33 ++++-----
 MAINTAINERS                                   | 12 ++++
 include/linux/sched.h                         |  4 ++
 include/linux/taskstats_kern.h                | 71 ++++++++++++++-----
 kernel/exit.c                                 |  8 +--
 kernel/fork.c                                 |  4 ++
 kernel/taskstats.c                            | 98 ++++++++++++++++++---------
 8 files changed, 162 insertions(+), 81 deletions(-)

(limited to 'kernel/taskstats.c')

diff --git a/Documentation/accounting/delay-accounting.txt b/Documentation/accounting/delay-accounting.txt
index f3dc0ca04fa..be215e58423 100644
--- a/Documentation/accounting/delay-accounting.txt
+++ b/Documentation/accounting/delay-accounting.txt
@@ -48,9 +48,10 @@ counter (say cpu_delay_total) for a task will give the delay
 experienced by the task waiting for the corresponding resource
 in that interval.
 
-When a task exits, records containing the per-task and per-process statistics
-are sent to userspace without requiring a command. More details are given in
-the taskstats interface description.
+When a task exits, records containing the per-task statistics
+are sent to userspace without requiring a command. If it is the last exiting
+task of a thread group, the per-tgid statistics are also sent. More details
+are given in the taskstats interface description.
 
 The getdelays.c userspace utility in this directory allows simple commands to
 be run and the corresponding delay statistics to be displayed. It also serves
@@ -107,9 +108,3 @@ IO	count	delay total
 	0	0
 MEM	count	delay total
 	0	0
-
-
-
-
-
-
diff --git a/Documentation/accounting/taskstats.txt b/Documentation/accounting/taskstats.txt
index acc6b4f37fc..efd8f605bcd 100644
--- a/Documentation/accounting/taskstats.txt
+++ b/Documentation/accounting/taskstats.txt
@@ -32,12 +32,11 @@ The response contains statistics for a task (if pid is specified) or the sum of
 statistics for all tasks of the process (if tgid is specified).
 
 To obtain statistics for tasks which are exiting, userspace opens a multicast
-netlink socket. Each time a task exits, two records are sent by the kernel to
-each listener on the multicast socket. The first the per-pid task's statistics
-and the second is the sum for all tasks of the process to which the task
-belongs (the task does not need to be the thread group leader). The need for
-per-tgid stats to be sent for each exiting task is explained in the per-tgid
-stats section below.
+netlink socket. Each time a task exits, its per-pid statistics is always sent
+by the kernel to each listener on the multicast socket. In addition, if it is
+the last thread exiting its thread group, an additional record containing the
+per-tgid stats are also sent. The latter contains the sum of per-pid stats for
+all threads in the thread group, both past and present.
 
 getdelays.c is a simple utility demonstrating usage of the taskstats interface
 for reporting delay accounting statistics.
@@ -104,20 +103,14 @@ stats in userspace alone is inefficient and potentially inaccurate (due to lack
 of atomicity).
 
 However, maintaining per-process, in addition to per-task stats, within the
-kernel has space and time overheads. Hence the taskstats implementation
-dynamically sums up the per-task stats for each task belonging to a process
-whenever per-process stats are needed.
-
-Not maintaining per-tgid stats creates a problem when userspace is interested
-in getting these stats when the process dies i.e. the last thread of
-a process exits. It isn't possible to simply return some aggregated per-process
-statistic from the kernel.
-
-The approach taken by taskstats is to return the per-tgid stats *each* time
-a task exits, in addition to the per-pid stats for that task. Userspace can
-maintain task<->process mappings and use them to maintain the per-process stats
-in userspace, updating the aggregate appropriately as the tasks of a process
-exit.
+kernel has space and time overheads. To address this, the taskstats code
+accumalates each exiting task's statistics into a process-wide data structure.
+When the last task of a process exits, the process level data accumalated also
+gets sent to userspace (along with the per-task data).
+
+When a user queries to get per-tgid data, the sum of all other live threads in
+the group is added up and added to the accumalated total for previously exited
+threads of the same thread group.
 
 Extending taskstats
 -------------------
diff --git a/MAINTAINERS b/MAINTAINERS
index 0557cfde053..e99028ca2f7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2240,6 +2240,12 @@ M:	tsbogend@alpha.franken.de
 L:	netdev@vger.kernel.org
 S:	Maintained
 
+PER-TASK DELAY ACCOUNTING
+P:	Shailabh Nagar
+M:	nagar@watson.ibm.com
+L:	linux-kernel@vger.kernel.org
+S:	Maintained
+
 PERSONALITY HANDLING
 P:	Christoph Hellwig
 M:	hch@infradead.org
@@ -2767,6 +2773,12 @@ P:	Deepak Saxena
 M:	dsaxena@plexity.net
 S:	Maintained
 
+TASKSTATS STATISTICS INTERFACE
+P:	Shailabh Nagar
+M:	nagar@watson.ibm.com
+L:	linux-kernel@vger.kernel.org
+S:	Maintained
+
 TI PARALLEL LINK CABLE DRIVER
 P:     Romain Lievin
 M:     roms@lpg.ticalc.org
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3c5610ca0c9..6afa72e080c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -463,6 +463,10 @@ struct signal_struct {
 #ifdef CONFIG_BSD_PROCESS_ACCT
 	struct pacct_struct pacct;	/* per-process accounting information */
 #endif
+#ifdef CONFIG_TASKSTATS
+	spinlock_t stats_lock;
+	struct taskstats *stats;
+#endif
 };
 
 /* Context switch must be unlocked if interrupts are to be enabled */
diff --git a/include/linux/taskstats_kern.h b/include/linux/taskstats_kern.h
index fc9da2e2644..0ae8f67af1f 100644
--- a/include/linux/taskstats_kern.h
+++ b/include/linux/taskstats_kern.h
@@ -19,36 +19,75 @@ enum {
 extern kmem_cache_t *taskstats_cache;
 extern struct mutex taskstats_exit_mutex;
 
-static inline void taskstats_exit_alloc(struct taskstats **ptidstats,
-					struct taskstats **ptgidstats)
+static inline void taskstats_exit_alloc(struct taskstats **ptidstats)
 {
 	*ptidstats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
-	*ptgidstats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
 }
 
-static inline void taskstats_exit_free(struct taskstats *tidstats,
-					struct taskstats *tgidstats)
+static inline void taskstats_exit_free(struct taskstats *tidstats)
 {
 	if (tidstats)
 		kmem_cache_free(taskstats_cache, tidstats);
-	if (tgidstats)
-		kmem_cache_free(taskstats_cache, tgidstats);
 }
 
-extern void taskstats_exit_send(struct task_struct *, struct taskstats *,
-				struct taskstats *);
-extern void taskstats_init_early(void);
+static inline void taskstats_tgid_init(struct signal_struct *sig)
+{
+	spin_lock_init(&sig->stats_lock);
+	sig->stats = NULL;
+}
+
+static inline void taskstats_tgid_alloc(struct signal_struct *sig)
+{
+	struct taskstats *stats;
+	unsigned long flags;
+
+	stats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
+	if (!stats)
+		return;
+
+	spin_lock_irqsave(&sig->stats_lock, flags);
+	if (!sig->stats) {
+		sig->stats = stats;
+		stats = NULL;
+	}
+	spin_unlock_irqrestore(&sig->stats_lock, flags);
+
+	if (stats)
+		kmem_cache_free(taskstats_cache, stats);
+}
 
+static inline void taskstats_tgid_free(struct signal_struct *sig)
+{
+	struct taskstats *stats = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&sig->stats_lock, flags);
+	if (sig->stats) {
+		stats = sig->stats;
+		sig->stats = NULL;
+	}
+	spin_unlock_irqrestore(&sig->stats_lock, flags);
+	if (stats)
+		kmem_cache_free(taskstats_cache, stats);
+}
+
+extern void taskstats_exit_send(struct task_struct *, struct taskstats *, int);
+extern void taskstats_init_early(void);
+extern void taskstats_tgid_alloc(struct signal_struct *);
 #else
-static inline void taskstats_exit_alloc(struct taskstats **ptidstats,
-					struct taskstats **ptgidstats)
+static inline void taskstats_exit_alloc(struct taskstats **ptidstats)
 {}
-static inline void taskstats_exit_free(struct taskstats *ptidstats,
-					struct taskstats *ptgidstats)
+static inline void taskstats_exit_free(struct taskstats *ptidstats)
 {}
 static inline void taskstats_exit_send(struct task_struct *tsk,
-					struct taskstats *tidstats,
-					struct taskstats *tgidstats)
+				       struct taskstats *tidstats,
+				       int group_dead)
+{}
+static inline void taskstats_tgid_init(struct signal_struct *sig)
+{}
+static inline void taskstats_tgid_alloc(struct signal_struct *sig)
+{}
+static inline void taskstats_tgid_free(struct signal_struct *sig)
 {}
 static inline void taskstats_init_early(void)
 {}
diff --git a/kernel/exit.c b/kernel/exit.c
index 9852ed8c298..67c1e9a4f81 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -845,7 +845,7 @@ static void exit_notify(struct task_struct *tsk)
 fastcall NORET_TYPE void do_exit(long code)
 {
 	struct task_struct *tsk = current;
-	struct taskstats *tidstats, *tgidstats;
+	struct taskstats *tidstats;
 	int group_dead;
 
 	profile_task_exit(tsk);
@@ -884,7 +884,7 @@ fastcall NORET_TYPE void do_exit(long code)
 				current->comm, current->pid,
 				preempt_count());
 
-	taskstats_exit_alloc(&tidstats, &tgidstats);
+	taskstats_exit_alloc(&tidstats);
 
 	acct_update_integrals(tsk);
 	if (tsk->mm) {
@@ -905,8 +905,8 @@ fastcall NORET_TYPE void do_exit(long code)
 #endif
 	if (unlikely(tsk->audit_context))
 		audit_free(tsk);
-	taskstats_exit_send(tsk, tidstats, tgidstats);
-	taskstats_exit_free(tidstats, tgidstats);
+	taskstats_exit_send(tsk, tidstats, group_dead);
+	taskstats_exit_free(tidstats);
 	delayacct_tsk_exit(tsk);
 
 	exit_mm(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index 451cfd35bf2..1b0f7b1e088 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -44,6 +44,7 @@
 #include <linux/acct.h>
 #include <linux/cn_proc.h>
 #include <linux/delayacct.h>
+#include <linux/taskstats_kern.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -819,6 +820,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 	if (clone_flags & CLONE_THREAD) {
 		atomic_inc(&current->signal->count);
 		atomic_inc(&current->signal->live);
+		taskstats_tgid_alloc(current->signal);
 		return 0;
 	}
 	sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
@@ -863,6 +865,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 	INIT_LIST_HEAD(&sig->cpu_timers[0]);
 	INIT_LIST_HEAD(&sig->cpu_timers[1]);
 	INIT_LIST_HEAD(&sig->cpu_timers[2]);
+	taskstats_tgid_init(sig);
 
 	task_lock(current->group_leader);
 	memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
@@ -884,6 +887,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 void __cleanup_signal(struct signal_struct *sig)
 {
 	exit_thread_group_keys(sig);
+	taskstats_tgid_free(sig);
 	kmem_cache_free(signal_cachep, sig);
 }
 
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index ea9506de3b8..4a0a5022b29 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -132,46 +132,79 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk,
 static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
 		struct taskstats *stats)
 {
-	int rc;
 	struct task_struct *tsk, *first;
+	unsigned long flags;
 
+	/*
+	 * Add additional stats from live tasks except zombie thread group
+	 * leaders who are already counted with the dead tasks
+	 */
 	first = tgidtsk;
-	read_lock(&tasklist_lock);
 	if (!first) {
+		read_lock(&tasklist_lock);
 		first = find_task_by_pid(tgid);
 		if (!first) {
 			read_unlock(&tasklist_lock);
 			return -ESRCH;
 		}
-	}
+		get_task_struct(first);
+		read_unlock(&tasklist_lock);
+	} else
+		get_task_struct(first);
+
+	/* Start with stats from dead tasks */
+	spin_lock_irqsave(&first->signal->stats_lock, flags);
+	if (first->signal->stats)
+		memcpy(stats, first->signal->stats, sizeof(*stats));
+	spin_unlock_irqrestore(&first->signal->stats_lock, flags);
+
 	tsk = first;
+	read_lock(&tasklist_lock);
 	do {
+		if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk))
+			continue;
 		/*
-		 * Each accounting subsystem adds calls its functions to
+		 * Accounting subsystem can call its functions here to
 		 * fill in relevant parts of struct taskstsats as follows
 		 *
-		 *	rc = per-task-foo(stats, tsk);
-		 *	if (rc)
-		 *		break;
+		 *	per-task-foo(stats, tsk);
 		 */
-
-		rc = delayacct_add_tsk(stats, tsk);
-		if (rc)
-			break;
+		delayacct_add_tsk(stats, tsk);
 
 	} while_each_thread(first, tsk);
 	read_unlock(&tasklist_lock);
 	stats->version = TASKSTATS_VERSION;
 
-
 	/*
-	 * Accounting subsytems can also add calls here if they don't
-	 * wish to aggregate statistics for per-tgid stats
+	 * Accounting subsytems can also add calls here to modify
+	 * fields of taskstats.
 	 */
 
-	return rc;
+	return 0;
+}
+
+
+static void fill_tgid_exit(struct task_struct *tsk)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&tsk->signal->stats_lock, flags);
+	if (!tsk->signal->stats)
+		goto ret;
+
+	/*
+	 * Each accounting subsystem calls its functions here to
+	 * accumalate its per-task stats for tsk, into the per-tgid structure
+	 *
+	 *	per-task-foo(tsk->signal->stats, tsk);
+	 */
+	delayacct_add_tsk(tsk->signal->stats, tsk);
+ret:
+	spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);
+	return;
 }
 
+
 static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info)
 {
 	int rc = 0;
@@ -230,7 +263,7 @@ err:
 
 /* Send pid data out on exit */
 void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
-			struct taskstats *tgidstats)
+			int group_dead)
 {
 	int rc;
 	struct sk_buff *rep_skb;
@@ -238,13 +271,16 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
 	size_t size;
 	int is_thread_group;
 	struct nlattr *na;
+	unsigned long flags;
 
 	if (!family_registered || !tidstats)
 		return;
 
-	is_thread_group = !thread_group_empty(tsk);
-	rc = 0;
+	spin_lock_irqsave(&tsk->signal->stats_lock, flags);
+	is_thread_group = tsk->signal->stats ? 1 : 0;
+	spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);
 
+	rc = 0;
 	/*
 	 * Size includes space for nested attributes
 	 */
@@ -268,30 +304,28 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
 			*tidstats);
 	nla_nest_end(rep_skb, na);
 
-	if (!is_thread_group || !tgidstats) {
-		send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
-		goto ret;
-	}
+	if (!is_thread_group)
+		goto send;
 
-	rc = fill_tgid(tsk->pid, tsk, tgidstats);
 	/*
-	 * If fill_tgid() failed then one probable reason could be that the
-	 * thread group leader has exited. fill_tgid() will fail, send out
-	 * the pid statistics collected earlier.
+	 * tsk has/had a thread group so fill the tsk->signal->stats structure
+	 * Doesn't matter if tsk is the leader or the last group member leaving
 	 */
-	if (rc < 0) {
-		send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
-		goto ret;
-	}
+
+	fill_tgid_exit(tsk);
+	if (!group_dead)
+		goto send;
 
 	na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
 	NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid);
+	/* No locking needed for tsk->signal->stats since group is dead */
 	NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
-			*tgidstats);
+			*tsk->signal->stats);
 	nla_nest_end(rep_skb, na);
 
+send:
 	send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
-	goto ret;
+	return;
 
 nla_put_failure:
 	genlmsg_cancel(rep_skb, reply);
-- 
cgit v1.2.3-70-g09d2


From f9fd8914c1acca0d98b69d831b128d5b52f03c51 Mon Sep 17 00:00:00 2001
From: Shailabh Nagar <nagar@watson.ibm.com>
Date: Fri, 14 Jul 2006 00:24:47 -0700
Subject: [PATCH] per-task delay accounting taskstats interface: control exit
 data through cpumasks

On systems with a large number of cpus, with even a modest rate of tasks
exiting per cpu, the volume of taskstats data sent on thread exit can
overflow a userspace listener's buffers.

One approach to avoiding overflow is to allow listeners to get data for a
limited and specific set of cpus.  By scaling the number of listeners
and/or the cpus they monitor, userspace can handle the statistical data
overload more gracefully.

In this patch, each listener registers to listen to a specific set of cpus
by specifying a cpumask.  The interest is recorded per-cpu.  When a task
exits on a cpu, its taskstats data is unicast to each listener interested
in that cpu.

Thanks to Andrew Morton for pointing out the various scalability and
general concerns of previous attempts and for suggesting this design.

[akpm@osdl.org: build fix]
Signed-off-by: Shailabh Nagar <nagar@watson.ibm.com>
Signed-off-by: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Chandra Seetharaman <sekharan@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/taskstats.h      |   4 +-
 include/linux/taskstats_kern.h |  27 +-----
 kernel/exit.c                  |   5 +-
 kernel/taskstats.c             | 200 ++++++++++++++++++++++++++++++++++++++---
 4 files changed, 198 insertions(+), 38 deletions(-)

(limited to 'kernel/taskstats.c')

diff --git a/include/linux/taskstats.h b/include/linux/taskstats.h
index c6aeca32348..f1cb6cddd19 100644
--- a/include/linux/taskstats.h
+++ b/include/linux/taskstats.h
@@ -91,8 +91,6 @@ struct taskstats {
 };
 
 
-#define TASKSTATS_LISTEN_GROUP	0x1
-
 /*
  * Commands sent from userspace
  * Not versioned. New commands should only be inserted at the enum's end
@@ -124,6 +122,8 @@ enum {
 	TASKSTATS_CMD_ATTR_UNSPEC = 0,
 	TASKSTATS_CMD_ATTR_PID,
 	TASKSTATS_CMD_ATTR_TGID,
+	TASKSTATS_CMD_ATTR_REGISTER_CPUMASK,
+	TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK,
 	__TASKSTATS_CMD_ATTR_MAX,
 };
 
diff --git a/include/linux/taskstats_kern.h b/include/linux/taskstats_kern.h
index 2b6adec3a2e..16894b7edcc 100644
--- a/include/linux/taskstats_kern.h
+++ b/include/linux/taskstats_kern.h
@@ -11,30 +11,10 @@
 #include <linux/sched.h>
 #include <net/genetlink.h>
 
-enum {
-	TASKSTATS_MSG_UNICAST,		/* send data only to requester */
-	TASKSTATS_MSG_MULTICAST,	/* send data to a group */
-};
-
 #ifdef CONFIG_TASKSTATS
 extern kmem_cache_t *taskstats_cache;
 extern struct mutex taskstats_exit_mutex;
 
-static inline int taskstats_has_listeners(void)
-{
-	if (!genl_sock)
-		return 0;
-	return netlink_has_listeners(genl_sock, TASKSTATS_LISTEN_GROUP);
-}
-
-
-static inline void taskstats_exit_alloc(struct taskstats **ptidstats)
-{
-	*ptidstats = NULL;
-	if (taskstats_has_listeners())
-		*ptidstats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
-}
-
 static inline void taskstats_exit_free(struct taskstats *tidstats)
 {
 	if (tidstats)
@@ -82,17 +62,18 @@ static inline void taskstats_tgid_free(struct signal_struct *sig)
 		kmem_cache_free(taskstats_cache, stats);
 }
 
-extern void taskstats_exit_send(struct task_struct *, struct taskstats *, int);
+extern void taskstats_exit_alloc(struct taskstats **, unsigned int *);
+extern void taskstats_exit_send(struct task_struct *, struct taskstats *, int, unsigned int);
 extern void taskstats_init_early(void);
 extern void taskstats_tgid_alloc(struct signal_struct *);
 #else
-static inline void taskstats_exit_alloc(struct taskstats **ptidstats)
+static inline void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu)
 {}
 static inline void taskstats_exit_free(struct taskstats *ptidstats)
 {}
 static inline void taskstats_exit_send(struct task_struct *tsk,
 				       struct taskstats *tidstats,
-				       int group_dead)
+				       int group_dead, unsigned int cpu)
 {}
 static inline void taskstats_tgid_init(struct signal_struct *sig)
 {}
diff --git a/kernel/exit.c b/kernel/exit.c
index 67c1e9a4f81..dba194a8d41 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -847,6 +847,7 @@ fastcall NORET_TYPE void do_exit(long code)
 	struct task_struct *tsk = current;
 	struct taskstats *tidstats;
 	int group_dead;
+	unsigned int mycpu;
 
 	profile_task_exit(tsk);
 
@@ -884,7 +885,7 @@ fastcall NORET_TYPE void do_exit(long code)
 				current->comm, current->pid,
 				preempt_count());
 
-	taskstats_exit_alloc(&tidstats);
+	taskstats_exit_alloc(&tidstats, &mycpu);
 
 	acct_update_integrals(tsk);
 	if (tsk->mm) {
@@ -905,7 +906,7 @@ fastcall NORET_TYPE void do_exit(long code)
 #endif
 	if (unlikely(tsk->audit_context))
 		audit_free(tsk);
-	taskstats_exit_send(tsk, tidstats, group_dead);
+	taskstats_exit_send(tsk, tidstats, group_dead, mycpu);
 	taskstats_exit_free(tidstats);
 	delayacct_tsk_exit(tsk);
 
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 4a0a5022b29..abb59e32354 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -19,9 +19,17 @@
 #include <linux/kernel.h>
 #include <linux/taskstats_kern.h>
 #include <linux/delayacct.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
 #include <net/genetlink.h>
 #include <asm/atomic.h>
 
+/*
+ * Maximum length of a cpumask that can be specified in
+ * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute
+ */
+#define TASKSTATS_CPUMASK_MAXLEN	(100+6*NR_CPUS)
+
 static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
 static int family_registered;
 kmem_cache_t *taskstats_cache;
@@ -37,8 +45,25 @@ static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1]
 __read_mostly = {
 	[TASKSTATS_CMD_ATTR_PID]  = { .type = NLA_U32 },
 	[TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
+	[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
+	[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
+
+struct listener {
+	struct list_head list;
+	pid_t pid;
 };
 
+struct listener_list {
+	struct rw_semaphore sem;
+	struct list_head list;
+};
+static DEFINE_PER_CPU(struct listener_list, listener_array);
+
+enum actions {
+	REGISTER,
+	DEREGISTER,
+	CPU_DONT_CARE
+};
 
 static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
 			void **replyp, size_t size)
@@ -74,25 +99,68 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
 	return 0;
 }
 
-static int send_reply(struct sk_buff *skb, pid_t pid, int event)
+/*
+ * Send taskstats data in @skb to listener with nl_pid @pid
+ */
+static int send_reply(struct sk_buff *skb, pid_t pid)
 {
 	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
-	void *reply;
+	void *reply = genlmsg_data(genlhdr);
 	int rc;
 
-	reply = genlmsg_data(genlhdr);
-
 	rc = genlmsg_end(skb, reply);
 	if (rc < 0) {
 		nlmsg_free(skb);
 		return rc;
 	}
 
-	if (event == TASKSTATS_MSG_MULTICAST)
-		return genlmsg_multicast(skb, pid, TASKSTATS_LISTEN_GROUP);
 	return genlmsg_unicast(skb, pid);
 }
 
+/*
+ * Send taskstats data in @skb to listeners registered for @cpu's exit data
+ */
+static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
+{
+	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
+	struct listener_list *listeners;
+	struct listener *s, *tmp;
+	struct sk_buff *skb_next, *skb_cur = skb;
+	void *reply = genlmsg_data(genlhdr);
+	int rc, ret;
+
+	rc = genlmsg_end(skb, reply);
+	if (rc < 0) {
+		nlmsg_free(skb);
+		return rc;
+	}
+
+	rc = 0;
+	listeners = &per_cpu(listener_array, cpu);
+	down_write(&listeners->sem);
+	list_for_each_entry_safe(s, tmp, &listeners->list, list) {
+		skb_next = NULL;
+		if (!list_is_last(&s->list, &listeners->list)) {
+			skb_next = skb_clone(skb_cur, GFP_KERNEL);
+			if (!skb_next) {
+				nlmsg_free(skb_cur);
+				rc = -ENOMEM;
+				break;
+			}
+		}
+		ret = genlmsg_unicast(skb_cur, s->pid);
+		if (ret == -ECONNREFUSED) {
+			list_del(&s->list);
+			kfree(s);
+			rc = ret;
+		}
+		skb_cur = skb_next;
+	}
+	up_write(&listeners->sem);
+
+	return rc;
+}
+
 static int fill_pid(pid_t pid, struct task_struct *pidtsk,
 		struct taskstats *stats)
 {
@@ -204,8 +272,73 @@ ret:
 	return;
 }
 
+static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
+{
+	struct listener_list *listeners;
+	struct listener *s, *tmp;
+	unsigned int cpu;
+	cpumask_t mask = *maskp;
 
-static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info)
+	if (!cpus_subset(mask, cpu_possible_map))
+		return -EINVAL;
+
+	if (isadd == REGISTER) {
+		for_each_cpu_mask(cpu, mask) {
+			s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
+					 cpu_to_node(cpu));
+			if (!s)
+				goto cleanup;
+			s->pid = pid;
+			INIT_LIST_HEAD(&s->list);
+
+			listeners = &per_cpu(listener_array, cpu);
+			down_write(&listeners->sem);
+			list_add(&s->list, &listeners->list);
+			up_write(&listeners->sem);
+		}
+		return 0;
+	}
+
+	/* Deregister or cleanup */
+cleanup:
+	for_each_cpu_mask(cpu, mask) {
+		listeners = &per_cpu(listener_array, cpu);
+		down_write(&listeners->sem);
+		list_for_each_entry_safe(s, tmp, &listeners->list, list) {
+			if (s->pid == pid) {
+				list_del(&s->list);
+				kfree(s);
+				break;
+			}
+		}
+		up_write(&listeners->sem);
+	}
+	return 0;
+}
+
+static int parse(struct nlattr *na, cpumask_t *mask)
+{
+	char *data;
+	int len;
+	int ret;
+
+	if (na == NULL)
+		return 1;
+	len = nla_len(na);
+	if (len > TASKSTATS_CPUMASK_MAXLEN)
+		return -E2BIG;
+	if (len < 1)
+		return -EINVAL;
+	data = kmalloc(len, GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+	nla_strlcpy(data, na, len);
+	ret = cpulist_parse(data, *mask);
+	kfree(data);
+	return ret;
+}
+
+static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 {
 	int rc = 0;
 	struct sk_buff *rep_skb;
@@ -213,6 +346,19 @@ static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info)
 	void *reply;
 	size_t size;
 	struct nlattr *na;
+	cpumask_t mask;
+
+	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask);
+	if (rc < 0)
+		return rc;
+	if (rc == 0)
+		return add_del_listener(info->snd_pid, &mask, REGISTER);
+
+	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask);
+	if (rc < 0)
+		return rc;
+	if (rc == 0)
+		return add_del_listener(info->snd_pid, &mask, DEREGISTER);
 
 	/*
 	 * Size includes space for nested attributes
@@ -252,7 +398,7 @@ static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info)
 
 	nla_nest_end(rep_skb, na);
 
-	return send_reply(rep_skb, info->snd_pid, TASKSTATS_MSG_UNICAST);
+	return send_reply(rep_skb, info->snd_pid);
 
 nla_put_failure:
 	return genlmsg_cancel(rep_skb, reply);
@@ -261,9 +407,35 @@ err:
 	return rc;
 }
 
+void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu)
+{
+	struct listener_list *listeners;
+	struct taskstats *tmp;
+	/*
+	 * This is the cpu on which the task is exiting currently and will
+	 * be the one for which the exit event is sent, even if the cpu
+	 * on which this function is running changes later.
+	 */
+	*mycpu = raw_smp_processor_id();
+
+	*ptidstats = NULL;
+	tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
+	if (!tmp)
+		return;
+
+	listeners = &per_cpu(listener_array, *mycpu);
+	down_read(&listeners->sem);
+	if (!list_empty(&listeners->list)) {
+		*ptidstats = tmp;
+		tmp = NULL;
+	}
+	up_read(&listeners->sem);
+	kfree(tmp);
+}
+
 /* Send pid data out on exit */
 void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
-			int group_dead)
+			int group_dead, unsigned int mycpu)
 {
 	int rc;
 	struct sk_buff *rep_skb;
@@ -324,7 +496,7 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
 	nla_nest_end(rep_skb, na);
 
 send:
-	send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
+	send_cpu_listeners(rep_skb, mycpu);
 	return;
 
 nla_put_failure:
@@ -338,16 +510,22 @@ ret:
 
 static struct genl_ops taskstats_ops = {
 	.cmd		= TASKSTATS_CMD_GET,
-	.doit		= taskstats_send_stats,
+	.doit		= taskstats_user_cmd,
 	.policy		= taskstats_cmd_get_policy,
 };
 
 /* Needed early in initialization */
 void __init taskstats_init_early(void)
 {
+	unsigned int i;
+
 	taskstats_cache = kmem_cache_create("taskstats_cache",
 						sizeof(struct taskstats),
 						0, SLAB_PANIC, NULL, NULL);
+	for_each_possible_cpu(i) {
+		INIT_LIST_HEAD(&(per_cpu(listener_array, i).list));
+		init_rwsem(&(per_cpu(listener_array, i).sem));
+	}
 }
 
 static int __init taskstats_init(void)
-- 
cgit v1.2.3-70-g09d2


From bb129994c3bff9c5e8df91f05d7e9b6402fbd83f Mon Sep 17 00:00:00 2001
From: Shailabh Nagar <nagar@watson.ibm.com>
Date: Fri, 14 Jul 2006 00:24:47 -0700
Subject: [PATCH] Remove down_write() from taskstats code invoked on the exit()
 path

In send_cpu_listeners(), which is called on the exit path, a down_write()
was protecting operations like skb_clone() and genlmsg_unicast() that do
GFP_KERNEL allocations.  If the oom-killer decides to kill tasks to satisfy
the allocations,the exit of those tasks could block on the same semphore.

The down_write() was only needed to allow removal of invalid listeners from
the listener list.  The patch converts the down_write to a down_read and
defers the removal to a separate critical region.  This ensures that even
if the oom-killer is called, no other task's exit is blocked as it can
still acquire another down_read.

Thanks to Andrew Morton & Herbert Xu for pointing out the oom related
pitfalls, and to Chandra Seetharaman for suggesting this fix instead of
using something more complex like RCU.

Signed-off-by: Chandra Seetharaman <sekharan@us.ibm.com>
Signed-off-by: Shailabh Nagar <nagar@watson.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/taskstats.c | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

(limited to 'kernel/taskstats.c')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index abb59e32354..f45179ce028 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -51,6 +51,7 @@ __read_mostly = {
 struct listener {
 	struct list_head list;
 	pid_t pid;
+	char valid;
 };
 
 struct listener_list {
@@ -127,7 +128,7 @@ static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
 	struct listener *s, *tmp;
 	struct sk_buff *skb_next, *skb_cur = skb;
 	void *reply = genlmsg_data(genlhdr);
-	int rc, ret;
+	int rc, ret, delcount = 0;
 
 	rc = genlmsg_end(skb, reply);
 	if (rc < 0) {
@@ -137,7 +138,7 @@ static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
 
 	rc = 0;
 	listeners = &per_cpu(listener_array, cpu);
-	down_write(&listeners->sem);
+	down_read(&listeners->sem);
 	list_for_each_entry_safe(s, tmp, &listeners->list, list) {
 		skb_next = NULL;
 		if (!list_is_last(&s->list, &listeners->list)) {
@@ -150,14 +151,26 @@ static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
 		}
 		ret = genlmsg_unicast(skb_cur, s->pid);
 		if (ret == -ECONNREFUSED) {
-			list_del(&s->list);
-			kfree(s);
+			s->valid = 0;
+			delcount++;
 			rc = ret;
 		}
 		skb_cur = skb_next;
 	}
-	up_write(&listeners->sem);
+	up_read(&listeners->sem);
+
+	if (!delcount)
+		return rc;
 
+	/* Delete invalidated entries */
+	down_write(&listeners->sem);
+	list_for_each_entry_safe(s, tmp, &listeners->list, list) {
+		if (!s->valid) {
+			list_del(&s->list);
+			kfree(s);
+		}
+	}
+	up_write(&listeners->sem);
 	return rc;
 }
 
@@ -290,6 +303,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
 				goto cleanup;
 			s->pid = pid;
 			INIT_LIST_HEAD(&s->list);
+			s->valid = 1;
 
 			listeners = &per_cpu(listener_array, cpu);
 			down_write(&listeners->sem);
-- 
cgit v1.2.3-70-g09d2


From 7d94dddd438bcba97db44f120da39bb001b5249f Mon Sep 17 00:00:00 2001
From: Shailabh Nagar <nagar@watson.ibm.com>
Date: Sun, 30 Jul 2006 03:03:10 -0700
Subject: [PATCH] make taskstats sending completely independent of delay
 accounting on/off status

Complete the separation of delay accounting and taskstats by ignoring the
return value of delay accounting functions that fill in parts of taskstats
before it is sent out (either in response to a command or as part of a task
exit).

Also make delayacct_add_tsk return silently when delay accounting is turned
off rather than treat it as an error.

Signed-off-by: Shailabh Nagar <nagar@watson.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/delayacct.h | 4 +---
 kernel/taskstats.c        | 8 +++-----
 2 files changed, 4 insertions(+), 8 deletions(-)

(limited to 'kernel/taskstats.c')

diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 7e8b6011b8f..8a284cc6fd5 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -80,9 +80,7 @@ static inline void delayacct_blkio_end(void)
 static inline int delayacct_add_tsk(struct taskstats *d,
 					struct task_struct *tsk)
 {
-	if (likely(!delayacct_on))
-		return -EINVAL;
-	if (!tsk->delays)
+	if (likely(!delayacct_on) || !tsk->delays)
 		return 0;
 	return __delayacct_add_tsk(d, tsk);
 }
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index f45179ce028..b4c737a1140 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -177,7 +177,7 @@ static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
 static int fill_pid(pid_t pid, struct task_struct *pidtsk,
 		struct taskstats *stats)
 {
-	int rc;
+	int rc = 0;
 	struct task_struct *tsk = pidtsk;
 
 	if (!pidtsk) {
@@ -196,12 +196,10 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk,
 	 * Each accounting subsystem adds calls to its functions to
 	 * fill in relevant parts of struct taskstsats as follows
 	 *
-	 *	rc = per-task-foo(stats, tsk);
-	 *	if (rc)
-	 *		goto err;
+	 *	per-task-foo(stats, tsk);
 	 */
 
-	rc = delayacct_add_tsk(stats, tsk);
+	delayacct_add_tsk(stats, tsk);
 	stats->version = TASKSTATS_VERSION;
 
 	/* Define err: label here if needed */
-- 
cgit v1.2.3-70-g09d2


From d94a041519f3ab1ac023bf917619cd8c4a7d3c01 Mon Sep 17 00:00:00 2001
From: Shailabh Nagar <nagar@watson.ibm.com>
Date: Sun, 30 Jul 2006 03:03:11 -0700
Subject: [PATCH] taskstats: free skb, avoid returns in send_cpu_listeners

Add a missing freeing of skb in the case there are no listeners at all.
Also remove the returning of error values by the function as it is unused
by the sole caller.

Signed-off-by: Shailabh Nagar <nagar@watson.ibm.com>
Signed-off-by: Chandra Seetharaman <sekharan@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/taskstats.c | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

(limited to 'kernel/taskstats.c')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index b4c737a1140..e7818765733 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -121,46 +121,45 @@ static int send_reply(struct sk_buff *skb, pid_t pid)
 /*
  * Send taskstats data in @skb to listeners registered for @cpu's exit data
  */
-static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
+static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
 {
 	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
 	struct listener_list *listeners;
 	struct listener *s, *tmp;
 	struct sk_buff *skb_next, *skb_cur = skb;
 	void *reply = genlmsg_data(genlhdr);
-	int rc, ret, delcount = 0;
+	int rc, delcount = 0;
 
 	rc = genlmsg_end(skb, reply);
 	if (rc < 0) {
 		nlmsg_free(skb);
-		return rc;
+		return;
 	}
 
 	rc = 0;
 	listeners = &per_cpu(listener_array, cpu);
 	down_read(&listeners->sem);
-	list_for_each_entry_safe(s, tmp, &listeners->list, list) {
+	list_for_each_entry(s, &listeners->list, list) {
 		skb_next = NULL;
 		if (!list_is_last(&s->list, &listeners->list)) {
 			skb_next = skb_clone(skb_cur, GFP_KERNEL);
-			if (!skb_next) {
-				nlmsg_free(skb_cur);
-				rc = -ENOMEM;
+			if (!skb_next)
 				break;
-			}
 		}
-		ret = genlmsg_unicast(skb_cur, s->pid);
-		if (ret == -ECONNREFUSED) {
+		rc = genlmsg_unicast(skb_cur, s->pid);
+		if (rc == -ECONNREFUSED) {
 			s->valid = 0;
 			delcount++;
-			rc = ret;
 		}
 		skb_cur = skb_next;
 	}
 	up_read(&listeners->sem);
 
+	if (skb_cur)
+		nlmsg_free(skb_cur);
+
 	if (!delcount)
-		return rc;
+		return;
 
 	/* Delete invalidated entries */
 	down_write(&listeners->sem);
@@ -171,7 +170,6 @@ static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
 		}
 	}
 	up_write(&listeners->sem);
-	return rc;
 }
 
 static int fill_pid(pid_t pid, struct task_struct *pidtsk,
-- 
cgit v1.2.3-70-g09d2