From b500531e6f5f234ed267bd7060ee06d144faf0ca Mon Sep 17 00:00:00 2001
From: Oleg Drokin <green@linuxhacker.ru>
Date: Sat, 25 Mar 2006 03:07:01 -0800
Subject: [PATCH] Introduce FMODE_EXEC file flag

Introduce FMODE_EXEC file flag, to indicate that file is being opened for
execution.  This is useful for distributed filesystems to maintain
consistent behavior for returning ETXTBUSY when opening for write and
execution happens on different nodes.

akpm:

  Needed by Lustre at present.  I assume their objective to to work towards
  being able to install Lustre on an unmodified distro kernel, which seems
  sane.  It should have zero runtime cost.

  Trond and Chuck indicate that NFS4 can probably use this too, for the same
  thing.

  Steven says it's also on the GFS todo list.

Signed-off-by: Oleg Drokin <green@linuxhacker.ru>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Chuck Lever <cel@citi.umich.edu>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/exec.c')

diff --git a/fs/exec.c b/fs/exec.c
index 0b515ac5313..d8c477a5625 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -127,7 +127,7 @@ asmlinkage long sys_uselib(const char __user * library)
 	struct nameidata nd;
 	int error;
 
-	error = __user_path_lookup_open(library, LOOKUP_FOLLOW, &nd, FMODE_READ);
+	error = __user_path_lookup_open(library, LOOKUP_FOLLOW, &nd, FMODE_READ|FMODE_EXEC);
 	if (error)
 		goto out;
 
@@ -477,7 +477,7 @@ struct file *open_exec(const char *name)
 	int err;
 	struct file *file;
 
-	err = path_lookup_open(AT_FDCWD, name, LOOKUP_FOLLOW, &nd, FMODE_READ);
+	err = path_lookup_open(AT_FDCWD, name, LOOKUP_FOLLOW, &nd, FMODE_READ|FMODE_EXEC);
 	file = ERR_PTR(err);
 
 	if (!err) {
-- 
cgit v1.2.3-70-g09d2


From 11b0b5abb2097a63c1081d9b7e825b987b227972 Mon Sep 17 00:00:00 2001
From: Oliver Neukum <neukum@fachschaft.cup.uni-muenchen.de>
Date: Sat, 25 Mar 2006 03:08:13 -0800
Subject: [PATCH] use kzalloc and kcalloc in core fs code

Signed-off-by: Oliver Neukum <oliver@neukum.name>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/aio.c        | 3 +--
 fs/binfmt_elf.c | 3 +--
 fs/bio.c        | 7 ++-----
 fs/char_dev.c   | 7 ++-----
 fs/compat.c     | 3 +--
 fs/exec.c       | 3 +--
 fs/pipe.c       | 3 +--
 fs/super.c      | 3 +--
 8 files changed, 10 insertions(+), 22 deletions(-)

(limited to 'fs/exec.c')

diff --git a/fs/aio.c b/fs/aio.c
index aec2b1916d1..e41e932ba48 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -122,10 +122,9 @@ static int aio_setup_ring(struct kioctx *ctx)
 	info->nr = 0;
 	info->ring_pages = info->internal_pages;
 	if (nr_pages > AIO_RING_PAGES) {
-		info->ring_pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_KERNEL);
+		info->ring_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
 		if (!info->ring_pages)
 			return -ENOMEM;
-		memset(info->ring_pages, 0, sizeof(struct page *) * nr_pages);
 	}
 
 	info->mmap_size = nr_pages * PAGE_SIZE;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index c2eac2a50bd..61c21e7dc95 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1465,12 +1465,11 @@ static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file)
 		read_lock(&tasklist_lock);
 		do_each_thread(g,p)
 			if (current->mm == p->mm && current != p) {
-				tmp = kmalloc(sizeof(*tmp), GFP_ATOMIC);
+				tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC);
 				if (!tmp) {
 					read_unlock(&tasklist_lock);
 					goto cleanup;
 				}
-				memset(tmp, 0, sizeof(*tmp));
 				INIT_LIST_HEAD(&tmp->list);
 				tmp->thread = p;
 				list_add(&tmp->list, &thread_list);
diff --git a/fs/bio.c b/fs/bio.c
index 0a8c59cb68f..73e664c01d3 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -636,12 +636,10 @@ static struct bio *__bio_map_user_iov(request_queue_t *q,
 		return ERR_PTR(-ENOMEM);
 
 	ret = -ENOMEM;
-	pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
+	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
 	if (!pages)
 		goto out;
 
-	memset(pages, 0, nr_pages * sizeof(struct page *));
-
 	for (i = 0; i < iov_count; i++) {
 		unsigned long uaddr = (unsigned long)iov[i].iov_base;
 		unsigned long len = iov[i].iov_len;
@@ -1186,12 +1184,11 @@ void bioset_free(struct bio_set *bs)
 
 struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size, int scale)
 {
-	struct bio_set *bs = kmalloc(sizeof(*bs), GFP_KERNEL);
+	struct bio_set *bs = kzalloc(sizeof(*bs), GFP_KERNEL);
 
 	if (!bs)
 		return NULL;
 
-	memset(bs, 0, sizeof(*bs));
 	bs->bio_pool = mempool_create(bio_pool_size, mempool_alloc_slab,
 			mempool_free_slab, bio_slab);
 
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 5c36345c9bf..8c6eb04d31e 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -146,12 +146,10 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor,
 	int ret = 0;
 	int i;
 
-	cd = kmalloc(sizeof(struct char_device_struct), GFP_KERNEL);
+	cd = kzalloc(sizeof(struct char_device_struct), GFP_KERNEL);
 	if (cd == NULL)
 		return ERR_PTR(-ENOMEM);
 
-	memset(cd, 0, sizeof(struct char_device_struct));
-
 	mutex_lock(&chrdevs_lock);
 
 	/* temporary */
@@ -466,9 +464,8 @@ static struct kobj_type ktype_cdev_dynamic = {
 
 struct cdev *cdev_alloc(void)
 {
-	struct cdev *p = kmalloc(sizeof(struct cdev), GFP_KERNEL);
+	struct cdev *p = kzalloc(sizeof(struct cdev), GFP_KERNEL);
 	if (p) {
-		memset(p, 0, sizeof(struct cdev));
 		p->kobj.ktype = &ktype_cdev_dynamic;
 		INIT_LIST_HEAD(&p->list);
 		kobject_init(&p->kobj);
diff --git a/fs/compat.c b/fs/compat.c
index 263990ae409..ef5a0771592 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1476,10 +1476,9 @@ int compat_do_execve(char * filename,
 	int i;
 
 	retval = -ENOMEM;
-	bprm = kmalloc(sizeof(*bprm), GFP_KERNEL);
+	bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
 	if (!bprm)
 		goto out_ret;
-	memset(bprm, 0, sizeof(*bprm));
 
 	file = open_exec(filename);
 	retval = PTR_ERR(file);
diff --git a/fs/exec.c b/fs/exec.c
index d8c477a5625..995cba3c62b 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1143,10 +1143,9 @@ int do_execve(char * filename,
 	int i;
 
 	retval = -ENOMEM;
-	bprm = kmalloc(sizeof(*bprm), GFP_KERNEL);
+	bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
 	if (!bprm)
 		goto out_ret;
-	memset(bprm, 0, sizeof(*bprm));
 
 	file = open_exec(filename);
 	retval = PTR_ERR(file);
diff --git a/fs/pipe.c b/fs/pipe.c
index 8aada8e426f..d976866a115 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -662,10 +662,9 @@ struct inode* pipe_new(struct inode* inode)
 {
 	struct pipe_inode_info *info;
 
-	info = kmalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
+	info = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
 	if (!info)
 		goto fail_page;
-	memset(info, 0, sizeof(*info));
 	inode->i_pipe = info;
 
 	init_waitqueue_head(PIPE_WAIT(*inode));
diff --git a/fs/super.c b/fs/super.c
index 37554b87618..8743e9bbb29 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -55,11 +55,10 @@ DEFINE_SPINLOCK(sb_lock);
  */
 static struct super_block *alloc_super(void)
 {
-	struct super_block *s = kmalloc(sizeof(struct super_block),  GFP_USER);
+	struct super_block *s = kzalloc(sizeof(struct super_block),  GFP_USER);
 	static struct super_operations default_op;
 
 	if (s) {
-		memset(s, 0, sizeof(struct super_block));
 		if (security_sb_alloc(s)) {
 			kfree(s);
 			s = NULL;
-- 
cgit v1.2.3-70-g09d2


From 05cfb614ddbf3181540ce09d44d96486f8ba8d6a Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Sun, 26 Mar 2006 01:38:12 -0800
Subject: [PATCH] hrtimers: remove data field

The nanosleep cleanup allows to remove the data field of hrtimer.  The
callback function can use container_of() to get it's own data.  Since the
hrtimer structure is anyway embedded in other structures, this adds no
overhead.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c               |  2 +-
 include/linux/hrtimer.h |  5 +----
 include/linux/sched.h   |  1 +
 include/linux/timer.h   |  3 ++-
 kernel/fork.c           |  2 +-
 kernel/hrtimer.c        | 12 +++++-------
 kernel/itimer.c         | 15 +++++++--------
 kernel/posix-timers.c   |  9 ++++-----
 8 files changed, 22 insertions(+), 27 deletions(-)

(limited to 'fs/exec.c')

diff --git a/fs/exec.c b/fs/exec.c
index 995cba3c62b..c7397c46ad6 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -632,7 +632,7 @@ static int de_thread(struct task_struct *tsk)
 		 * synchronize with any firing (by calling del_timer_sync)
 		 * before we can safely let the old group leader die.
 		 */
-		sig->real_timer.data = current;
+		sig->tsk = current;
 		spin_unlock_irq(lock);
 		if (hrtimer_cancel(&sig->real_timer))
 			hrtimer_restart(&sig->real_timer);
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index f57cc7bd700..93830158348 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -45,9 +45,7 @@ struct hrtimer_base;
  * @expires:	the absolute expiry time in the hrtimers internal
  *		representation. The time is related to the clock on
  *		which the timer is based.
- * @state:	state of the timer
  * @function:	timer expiry callback function
- * @data:	argument for the callback function
  * @base:	pointer to the timer base (per cpu and per clock)
  *
  * The hrtimer structure must be initialized by init_hrtimer_#CLOCKTYPE()
@@ -55,8 +53,7 @@ struct hrtimer_base;
 struct hrtimer {
 	struct rb_node		node;
 	ktime_t			expires;
-	int			(*function)(void *);
-	void			*data;
+	int			(*function)(struct hrtimer *);
 	struct hrtimer_base	*base;
 };
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e0054c1b9a0..036d14d2bf9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -402,6 +402,7 @@ struct signal_struct {
 
 	/* ITIMER_REAL timer for the process */
 	struct hrtimer real_timer;
+	struct task_struct *tsk;
 	ktime_t it_real_incr;
 
 	/* ITIMER_PROF and ITIMER_VIRTUAL timers for the process */
diff --git a/include/linux/timer.h b/include/linux/timer.h
index ee5a09e806e..b5caabca553 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -96,6 +96,7 @@ static inline void add_timer(struct timer_list *timer)
 
 extern void init_timers(void);
 extern void run_local_timers(void);
-extern int it_real_fn(void *);
+struct hrtimer;
+extern int it_real_fn(struct hrtimer *);
 
 #endif
diff --git a/kernel/fork.c b/kernel/fork.c
index a02063903aa..4bd6486aa67 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -848,7 +848,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 	hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL);
 	sig->it_real_incr.tv64 = 0;
 	sig->real_timer.function = it_real_fn;
-	sig->real_timer.data = tsk;
+	sig->tsk = tsk;
 
 	sig->it_virt_expires = cputime_zero;
 	sig->it_virt_incr = cputime_zero;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 44108de4f02..0237a556eb1 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -613,21 +613,19 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base)
 
 	while ((node = base->first)) {
 		struct hrtimer *timer;
-		int (*fn)(void *);
+		int (*fn)(struct hrtimer *);
 		int restart;
-		void *data;
 
 		timer = rb_entry(node, struct hrtimer, node);
 		if (base->softirq_time.tv64 <= timer->expires.tv64)
 			break;
 
 		fn = timer->function;
-		data = timer->data;
 		set_curr_timer(base, timer);
 		__remove_hrtimer(timer, base);
 		spin_unlock_irq(&base->lock);
 
-		restart = fn(data);
+		restart = fn(timer);
 
 		spin_lock_irq(&base->lock);
 
@@ -664,9 +662,10 @@ struct sleep_hrtimer {
 	int expired;
 };
 
-static int nanosleep_wakeup(void *data)
+static int nanosleep_wakeup(struct hrtimer *timer)
 {
-	struct sleep_hrtimer *t = data;
+	struct sleep_hrtimer *t =
+		container_of(timer, struct sleep_hrtimer, timer);
 
 	t->expired = 1;
 	wake_up_process(t->task);
@@ -677,7 +676,6 @@ static int nanosleep_wakeup(void *data)
 static int __sched do_nanosleep(struct sleep_hrtimer *t, enum hrtimer_mode mode)
 {
 	t->timer.function = nanosleep_wakeup;
-	t->timer.data = t;
 	t->task = current;
 	t->expired = 0;
 
diff --git a/kernel/itimer.c b/kernel/itimer.c
index af2ec6b4392..204ed7939e7 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -128,17 +128,16 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value)
 /*
  * The timer is automagically restarted, when interval != 0
  */
-int it_real_fn(void *data)
+int it_real_fn(struct hrtimer *timer)
 {
-	struct task_struct *tsk = (struct task_struct *) data;
+	struct signal_struct *sig =
+	    container_of(timer, struct signal_struct, real_timer);
 
-	send_group_sig_info(SIGALRM, SEND_SIG_PRIV, tsk);
-
-	if (tsk->signal->it_real_incr.tv64 != 0) {
-		hrtimer_forward(&tsk->signal->real_timer,
-				tsk->signal->real_timer.base->softirq_time,
-				tsk->signal->it_real_incr);
+	send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk);
 
+	if (sig->it_real_incr.tv64 != 0) {
+		hrtimer_forward(timer, timer->base->softirq_time,
+				sig->it_real_incr);
 		return HRTIMER_RESTART;
 	}
 	return HRTIMER_NORESTART;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 7c5f44787c8..ac6dc874442 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -145,7 +145,7 @@ static int common_timer_set(struct k_itimer *, int,
 			    struct itimerspec *, struct itimerspec *);
 static int common_timer_del(struct k_itimer *timer);
 
-static int posix_timer_fn(void *data);
+static int posix_timer_fn(struct hrtimer *data);
 
 static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags);
 
@@ -334,14 +334,14 @@ EXPORT_SYMBOL_GPL(posix_timer_event);
 
  * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers.
  */
-static int posix_timer_fn(void *data)
+static int posix_timer_fn(struct hrtimer *timer)
 {
-	struct k_itimer *timr = data;
-	struct hrtimer *timer = &timr->it.real.timer;
+	struct k_itimer *timr;
 	unsigned long flags;
 	int si_private = 0;
 	int ret = HRTIMER_NORESTART;
 
+	timr = container_of(timer, struct k_itimer, it.real.timer);
 	spin_lock_irqsave(&timr->it_lock, flags);
 
 	if (timr->it.real.interval.tv64 != 0)
@@ -725,7 +725,6 @@ common_timer_set(struct k_itimer *timr, int flags,
 
 	mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL;
 	hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
-	timr->it.real.timer.data = timr;
 	timr->it.real.timer.function = posix_timer_fn;
 
 	timer->expires = timespec_to_ktime(new_setting->it_value);
-- 
cgit v1.2.3-70-g09d2


From fef23e7fbb11a0a78cd61935f7056bc2b237995a Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 28 Mar 2006 16:10:58 -0800
Subject: [PATCH] exec: allow init to exec from any thread.

After looking at the problem of init calling exec some more I figured out
an easy way to make the code work.

The actual symptom without out this patch is that all threads will die
except pid == 1, and the thread calling exec.  The thread calling exec will
wait forever for pid == 1 to die.

Since pid == 1 does not install a handler for SIGKILL it will never die.

This modifies the tests for init from current->pid == 1 to the equivalent
current == child_reaper.  And then it causes exec in the ugly case to
modify child_reaper.

The only weird symptom is that you wind up with an init process that
doesn't have the oldest start time on the box.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c       | 13 ++++++++++++-
 kernel/exit.c   |  2 +-
 kernel/signal.c |  2 +-
 3 files changed, 14 insertions(+), 3 deletions(-)

(limited to 'fs/exec.c')

diff --git a/fs/exec.c b/fs/exec.c
index c7397c46ad6..d0ecea0781f 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -660,12 +660,23 @@ static int de_thread(struct task_struct *tsk)
 		struct dentry *proc_dentry1, *proc_dentry2;
 		unsigned long ptrace;
 
+		leader = current->group_leader;
+		/*
+		 * If our leader is the child_reaper become
+		 * the child_reaper and resend SIGKILL signal.
+		 */
+		if (unlikely(leader == child_reaper)) {
+			write_lock(&tasklist_lock);
+			child_reaper = current;
+			zap_other_threads(current);
+			write_unlock(&tasklist_lock);
+		}
+
 		/*
 		 * Wait for the thread group leader to be a zombie.
 		 * It should already be zombie at this point, most
 		 * of the time.
 		 */
-		leader = current->group_leader;
 		while (leader->exit_state != EXIT_ZOMBIE)
 			yield();
 
diff --git a/kernel/exit.c b/kernel/exit.c
index a8c7efc7a68..223a8802b66 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -807,7 +807,7 @@ fastcall NORET_TYPE void do_exit(long code)
 		panic("Aiee, killing interrupt handler!");
 	if (unlikely(!tsk->pid))
 		panic("Attempted to kill the idle task!");
-	if (unlikely(tsk->pid == 1))
+	if (unlikely(tsk == child_reaper))
 		panic("Attempted to kill init!");
 
 	if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
diff --git a/kernel/signal.c b/kernel/signal.c
index 75f7341b0c3..dc8f91bf9f8 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1990,7 +1990,7 @@ relock:
 			continue;
 
 		/* Init gets no signals it doesn't want.  */
-		if (current->pid == 1)
+		if (current == child_reaper)
 			continue;
 
 		if (sig_kernel_stop(signr)) {
-- 
cgit v1.2.3-70-g09d2


From 1434261c07bcebd5ef8b8a18f919fdee533b84e0 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:10:59 -0800
Subject: [PATCH] simplify exec from init's subthread

I think it is enough to take tasklist_lock for reading while changing
child_reaper:

	Reparenting needs write_lock(tasklist_lock)

	Only one thread in a thread group can do exec()

	sighand->siglock garantees that get_signal_to_deliver()
	will not see a stale value of child_reaper.

This means that we can change child_reaper earlier, without calling
zap_other_threads() twice.

"child_reaper = current" is a NOOP when init does exec from main thread, we
don't care.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

(limited to 'fs/exec.c')

diff --git a/fs/exec.c b/fs/exec.c
index d0ecea0781f..dd194923c52 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -616,6 +616,15 @@ static int de_thread(struct task_struct *tsk)
 		kmem_cache_free(sighand_cachep, newsighand);
 		return -EAGAIN;
 	}
+
+	/*
+	 * child_reaper ignores SIGKILL, change it now.
+	 * Reparenting needs write_lock on tasklist_lock,
+	 * so it is safe to do it under read_lock.
+	 */
+	if (unlikely(current->group_leader == child_reaper))
+		child_reaper = current;
+
 	zap_other_threads(current);
 	read_unlock(&tasklist_lock);
 
@@ -660,23 +669,12 @@ static int de_thread(struct task_struct *tsk)
 		struct dentry *proc_dentry1, *proc_dentry2;
 		unsigned long ptrace;
 
-		leader = current->group_leader;
-		/*
-		 * If our leader is the child_reaper become
-		 * the child_reaper and resend SIGKILL signal.
-		 */
-		if (unlikely(leader == child_reaper)) {
-			write_lock(&tasklist_lock);
-			child_reaper = current;
-			zap_other_threads(current);
-			write_unlock(&tasklist_lock);
-		}
-
 		/*
 		 * Wait for the thread group leader to be a zombie.
 		 * It should already be zombie at this point, most
 		 * of the time.
 		 */
+		leader = current->group_leader;
 		while (leader->exit_state != EXIT_ZOMBIE)
 			yield();
 
-- 
cgit v1.2.3-70-g09d2


From d73d65293e3e2de7e916a89c8da30be0948afab7 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 28 Mar 2006 16:11:03 -0800
Subject: [PATCH] pidhash: kill switch_exec_pids

switch_exec_pids is only called from de_thread by way of exec, and it is
only called when we are exec'ing from a non thread group leader.

Currently switch_exec_pids gives the leader the pid of the thread and
unhashes and rehashes all of the process groups.  The leader is already in
the EXIT_DEAD state so no one cares about it's pids.  The only concern for
the leader is that __unhash_process called from release_task will function
correctly.  If we don't touch the leader at all we know that
__unhash_process will work fine so there is no need to touch the leader.

For the task becomming the thread group leader, we just need to give it the
pid of the old thread group leader, add it to the task list, and attach it
to the session and the process group of the thread group.

Currently de_thread is also adding the task to the task list which is just
silly.

Currently the only leader of __detach_pid besides detach_pid is
switch_exec_pids because of the ugly extra work that was being
performed.

So this patch removes switch_exec_pids because it is doing too much, it is
creating an unnecessary special case in pid.c, duing work duplicated in
de_thread, and generally obscuring what it is going on.

The necessary work is added to de_thread, and it seems to be a little
clearer there what is going on.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Kirill Korotaev <dev@sw.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c           | 14 +++++++++++---
 include/linux/pid.h |  1 -
 kernel/pid.c        | 30 ------------------------------
 3 files changed, 11 insertions(+), 34 deletions(-)

(limited to 'fs/exec.c')

diff --git a/fs/exec.c b/fs/exec.c
index dd194923c52..db0769447d3 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -708,7 +708,17 @@ static int de_thread(struct task_struct *tsk)
 		remove_parent(current);
 		remove_parent(leader);
 
-		switch_exec_pids(leader, current);
+
+		/* Become a process group leader with the old leader's pid.
+		 * Note: The old leader also uses thispid until release_task
+		 *       is called.  Odd but simple and correct.
+		 */
+		detach_pid(current, PIDTYPE_PID);
+		current->pid = leader->pid;
+		attach_pid(current, PIDTYPE_PID,  current->pid);
+		attach_pid(current, PIDTYPE_PGID, current->signal->pgrp);
+		attach_pid(current, PIDTYPE_SID,  current->signal->session);
+		list_add_tail(&current->tasks, &init_task.tasks);
 
 		current->parent = current->real_parent = leader->real_parent;
 		leader->parent = leader->real_parent = child_reaper;
@@ -722,8 +732,6 @@ static int de_thread(struct task_struct *tsk)
 			__ptrace_link(current, parent);
 		}
 
-		list_del(&current->tasks);
-		list_add_tail(&current->tasks, &init_task.tasks);
 		current->exit_signal = SIGCHLD;
 
 		BUG_ON(leader->exit_state != EXIT_ZOMBIE);
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 5b2fcb19d2d..099e70ecf7c 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -38,7 +38,6 @@ extern struct pid *FASTCALL(find_pid(enum pid_type, int));
 
 extern int alloc_pidmap(void);
 extern void FASTCALL(free_pidmap(int));
-extern void switch_exec_pids(struct task_struct *leader, struct task_struct *thread);
 
 #define do_each_task_pid(who, type, task)				\
 	if ((task = find_task_by_pid_type(type, who))) {		\
diff --git a/kernel/pid.c b/kernel/pid.c
index 1acc0724699..7781d999905 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -217,36 +217,6 @@ task_t *find_task_by_pid_type(int type, int nr)
 
 EXPORT_SYMBOL(find_task_by_pid_type);
 
-/*
- * This function switches the PIDs if a non-leader thread calls
- * sys_execve() - this must be done without releasing the PID.
- * (which a detach_pid() would eventually do.)
- */
-void switch_exec_pids(task_t *leader, task_t *thread)
-{
-	__detach_pid(leader, PIDTYPE_PID);
-	__detach_pid(leader, PIDTYPE_TGID);
-	__detach_pid(leader, PIDTYPE_PGID);
-	__detach_pid(leader, PIDTYPE_SID);
-
-	__detach_pid(thread, PIDTYPE_PID);
-	__detach_pid(thread, PIDTYPE_TGID);
-
-	leader->pid = leader->tgid = thread->pid;
-	thread->pid = thread->tgid;
-
-	attach_pid(thread, PIDTYPE_PID, thread->pid);
-	attach_pid(thread, PIDTYPE_TGID, thread->tgid);
-	attach_pid(thread, PIDTYPE_PGID, thread->signal->pgrp);
-	attach_pid(thread, PIDTYPE_SID, thread->signal->session);
-	list_add_tail(&thread->tasks, &init_task.tasks);
-
-	attach_pid(leader, PIDTYPE_PID, leader->pid);
-	attach_pid(leader, PIDTYPE_TGID, leader->tgid);
-	attach_pid(leader, PIDTYPE_PGID, leader->signal->pgrp);
-	attach_pid(leader, PIDTYPE_SID, leader->signal->session);
-}
-
 /*
  * The pid hash table is scaled according to the amount of memory in the
  * machine.  From a minimum of 16 slots up to 4096 slots at one gigabyte or
-- 
cgit v1.2.3-70-g09d2


From 8fafabd86f1b75ed3cc6a6ffbe6c3e53e3d8457d Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:05 -0800
Subject: [PATCH] remove add_parent()'s parent argument

add_parent(p, parent) is always called with parent == p->parent, and it makes
no sense to do it differently.  This patch removes this argument.

No changes in affected .o files.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/mips/kernel/irixsig.c | 4 ++--
 fs/exec.c                  | 4 ++--
 include/linux/sched.h      | 4 ++--
 kernel/exit.c              | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs/exec.c')

diff --git a/arch/mips/kernel/irixsig.c b/arch/mips/kernel/irixsig.c
index 08273a2a501..8150f071f80 100644
--- a/arch/mips/kernel/irixsig.c
+++ b/arch/mips/kernel/irixsig.c
@@ -603,7 +603,7 @@ repeat:
 			/* move to end of parent's list to avoid starvation */
 			write_lock_irq(&tasklist_lock);
 			remove_parent(p);
-			add_parent(p, p->parent);
+			add_parent(p);
 			write_unlock_irq(&tasklist_lock);
 			retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
 			if (retval)
@@ -643,7 +643,7 @@ repeat:
 				write_lock_irq(&tasklist_lock);
 				remove_parent(p);
 				p->parent = p->real_parent;
-				add_parent(p, p->parent);
+				add_parent(p);
 				do_notify_parent(p, SIGCHLD);
 				write_unlock_irq(&tasklist_lock);
 			} else
diff --git a/fs/exec.c b/fs/exec.c
index db0769447d3..9046ad2b061 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -725,8 +725,8 @@ static int de_thread(struct task_struct *tsk)
 		current->group_leader = current;
 		leader->group_leader = leader;
 
-		add_parent(current, current->parent);
-		add_parent(leader, leader->parent);
+		add_parent(current);
+		add_parent(leader);
 		if (ptrace) {
 			current->ptrace = ptrace;
 			__ptrace_link(current, parent);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5f5ab98bbb6..b4b14c32b28 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1184,7 +1184,7 @@ extern void wait_task_inactive(task_t * p);
 #endif
 
 #define remove_parent(p)	list_del_init(&(p)->sibling)
-#define add_parent(p, parent)	list_add_tail(&(p)->sibling,&(parent)->children)
+#define add_parent(p)		list_add_tail(&(p)->sibling,&(p)->parent->children)
 
 #define REMOVE_LINKS(p) do {					\
 	if (thread_group_leader(p))				\
@@ -1195,7 +1195,7 @@ extern void wait_task_inactive(task_t * p);
 #define SET_LINKS(p) do {					\
 	if (thread_group_leader(p))				\
 		list_add_tail(&(p)->tasks,&init_task.tasks);	\
-	add_parent(p, (p)->parent);				\
+	add_parent(p);						\
 	} while (0)
 
 #define next_task(p)	list_entry((p)->tasks.next, struct task_struct, tasks)
diff --git a/kernel/exit.c b/kernel/exit.c
index e04a59405e9..df26c33037d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1281,7 +1281,7 @@ bail_ref:
 
 	/* move to end of parent's list to avoid starvation */
 	remove_parent(p);
-	add_parent(p, p->parent);
+	add_parent(p);
 
 	write_unlock_irq(&tasklist_lock);
 
-- 
cgit v1.2.3-70-g09d2


From aa1757f90bea3f598b6e5d04d922a6a60200f1da Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:12 -0800
Subject: [PATCH] convert sighand_cache to use SLAB_DESTROY_BY_RCU

This patch borrows a clever Hugh's 'struct anon_vma' trick.

Without tasklist_lock held we can't trust task->sighand until we locked it
and re-checked that it is still the same.

But this means we don't need to defer 'kmem_cache_free(sighand)'.  We can
return the memory to slab immediately, all we need is to be sure that
sighand->siglock can't dissapear inside rcu protected section.

To do so we need to initialize ->siglock inside ctor function,
SLAB_DESTROY_BY_RCU does the rest.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c             |  3 +--
 include/linux/sched.h |  8 --------
 kernel/fork.c         | 21 +++++++++++----------
 kernel/signal.c       |  2 +-
 4 files changed, 13 insertions(+), 21 deletions(-)

(limited to 'fs/exec.c')

diff --git a/fs/exec.c b/fs/exec.c
index 9046ad2b061..950ebd43cdc 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -768,7 +768,6 @@ no_thread_group:
 		/*
 		 * Move our state over to newsighand and switch it in.
 		 */
-		spin_lock_init(&newsighand->siglock);
 		atomic_set(&newsighand->count, 1);
 		memcpy(newsighand->action, oldsighand->action,
 		       sizeof(newsighand->action));
@@ -785,7 +784,7 @@ no_thread_group:
 		write_unlock_irq(&tasklist_lock);
 
 		if (atomic_dec_and_test(&oldsighand->count))
-			sighand_free(oldsighand);
+			kmem_cache_free(sighand_cachep, oldsighand);
 	}
 
 	BUG_ON(!thread_group_leader(current));
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ddc0df7f8bf..bbcfc873bd9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -355,16 +355,8 @@ struct sighand_struct {
 	atomic_t		count;
 	struct k_sigaction	action[_NSIG];
 	spinlock_t		siglock;
-	struct rcu_head		rcu;
 };
 
-extern void sighand_free_cb(struct rcu_head *rhp);
-
-static inline void sighand_free(struct sighand_struct *sp)
-{
-	call_rcu(&sp->rcu, sighand_free_cb);
-}
-
 /*
  * NOTE! "signal_struct" does not have it's own
  * locking, because a shared signal_struct always
diff --git a/kernel/fork.c b/kernel/fork.c
index 0c32e28cdc5..33ffb5bf0db 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -786,14 +786,6 @@ int unshare_files(void)
 
 EXPORT_SYMBOL(unshare_files);
 
-void sighand_free_cb(struct rcu_head *rhp)
-{
-	struct sighand_struct *sp;
-
-	sp = container_of(rhp, struct sighand_struct, rcu);
-	kmem_cache_free(sighand_cachep, sp);
-}
-
 static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
 {
 	struct sighand_struct *sig;
@@ -806,7 +798,6 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t
 	rcu_assign_pointer(tsk->sighand, sig);
 	if (!sig)
 		return -ENOMEM;
-	spin_lock_init(&sig->siglock);
 	atomic_set(&sig->count, 1);
 	memcpy(sig->action, current->sighand->action, sizeof(sig->action));
 	return 0;
@@ -1356,11 +1347,21 @@ long do_fork(unsigned long clone_flags,
 #define ARCH_MIN_MMSTRUCT_ALIGN 0
 #endif
 
+static void sighand_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
+{
+	struct sighand_struct *sighand = data;
+
+	if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
+					SLAB_CTOR_CONSTRUCTOR)
+		spin_lock_init(&sighand->siglock);
+}
+
 void __init proc_caches_init(void)
 {
 	sighand_cachep = kmem_cache_create("sighand_cache",
 			sizeof(struct sighand_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU,
+			sighand_ctor, NULL);
 	signal_cachep = kmem_cache_create("signal_cache",
 			sizeof(struct signal_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
diff --git a/kernel/signal.c b/kernel/signal.c
index dc8f91bf9f8..b0b1ca9daa3 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -330,7 +330,7 @@ void __exit_sighand(struct task_struct *tsk)
 	/* Ok, we're done with the signal handlers */
 	tsk->sighand = NULL;
 	if (atomic_dec_and_test(&sighand->count))
-		sighand_free(sighand);
+		kmem_cache_free(sighand_cachep, sighand);
 }
 
 void exit_sighand(struct task_struct *tsk)
-- 
cgit v1.2.3-70-g09d2