From cf475ad28ac35cc9ba612d67158f29b73b38b05d Mon Sep 17 00:00:00 2001
From: Balbir Singh <balbir@linux.vnet.ibm.com>
Date: Tue, 29 Apr 2008 01:00:16 -0700
Subject: cgroups: add an owner to the mm_struct

Remove the mem_cgroup member from mm_struct and instead adds an owner.

This approach was suggested by Paul Menage.  The advantage of this approach
is that, once the mm->owner is known, using the subsystem id, the cgroup
can be determined.  It also allows several control groups that are
virtually grouped by mm_struct, to exist independent of the memory
controller i.e., without adding mem_cgroup's for each controller, to
mm_struct.

A new config option CONFIG_MM_OWNER is added and the memory resource
controller selects this config option.

This patch also adds cgroup callbacks to notify subsystems when mm->owner
changes.  The mm_cgroup_changed callback is called with the task_lock() of
the new task held and is called just prior to changing the mm->owner.

I am indebted to Paul Menage for the several reviews of this patchset and
helping me make it lighter and simpler.

This patch was tested on a powerpc box, it was compiled with both the
MM_OWNER config turned on and off.

After the thread group leader exits, it's moved to init_css_state by
cgroup_exit(), thus all future charges from runnings threads would be
redirected to the init_css_set's subsystem.

Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Pavel Emelianov <xemul@openvz.org>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Sudhir Kumar <skumar@linux.vnet.ibm.com>
Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Cc: Hirokazu Takahashi <taka@valinux.co.jp>
Cc: David Rientjes <rientjes@google.com>,
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Reviewed-by: Paul Menage <menage@google.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 83 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 83 insertions(+)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 2a9d98c641a..ae0f2c4e452 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -557,6 +557,88 @@ void exit_fs(struct task_struct *tsk)
 
 EXPORT_SYMBOL_GPL(exit_fs);
 
+#ifdef CONFIG_MM_OWNER
+/*
+ * Task p is exiting and it owned mm, lets find a new owner for it
+ */
+static inline int
+mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
+{
+	/*
+	 * If there are other users of the mm and the owner (us) is exiting
+	 * we need to find a new owner to take on the responsibility.
+	 */
+	if (!mm)
+		return 0;
+	if (atomic_read(&mm->mm_users) <= 1)
+		return 0;
+	if (mm->owner != p)
+		return 0;
+	return 1;
+}
+
+void mm_update_next_owner(struct mm_struct *mm)
+{
+	struct task_struct *c, *g, *p = current;
+
+retry:
+	if (!mm_need_new_owner(mm, p))
+		return;
+
+	read_lock(&tasklist_lock);
+	/*
+	 * Search in the children
+	 */
+	list_for_each_entry(c, &p->children, sibling) {
+		if (c->mm == mm)
+			goto assign_new_owner;
+	}
+
+	/*
+	 * Search in the siblings
+	 */
+	list_for_each_entry(c, &p->parent->children, sibling) {
+		if (c->mm == mm)
+			goto assign_new_owner;
+	}
+
+	/*
+	 * Search through everything else. We should not get
+	 * here often
+	 */
+	do_each_thread(g, c) {
+		if (c->mm == mm)
+			goto assign_new_owner;
+	} while_each_thread(g, c);
+
+	read_unlock(&tasklist_lock);
+	return;
+
+assign_new_owner:
+	BUG_ON(c == p);
+	get_task_struct(c);
+	/*
+	 * The task_lock protects c->mm from changing.
+	 * We always want mm->owner->mm == mm
+	 */
+	task_lock(c);
+	/*
+	 * Delay read_unlock() till we have the task_lock()
+	 * to ensure that c does not slip away underneath us
+	 */
+	read_unlock(&tasklist_lock);
+	if (c->mm != mm) {
+		task_unlock(c);
+		put_task_struct(c);
+		goto retry;
+	}
+	cgroup_mm_owner_callbacks(mm->owner, c);
+	mm->owner = c;
+	task_unlock(c);
+	put_task_struct(c);
+}
+#endif /* CONFIG_MM_OWNER */
+
 /*
  * Turn us into a lazy TLB process if we
  * aren't already..
@@ -596,6 +678,7 @@ static void exit_mm(struct task_struct * tsk)
 	/* We don't want this task to be frozen prematurely */
 	clear_freeze_flag(tsk);
 	task_unlock(tsk);
+	mm_update_next_owner(mm);
 	mmput(mm);
 }
 
-- 
cgit v1.2.3-70-g09d2


From bfc4b0890af566940de6e7aeb4b5faf46d3c3513 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:52:36 -0700
Subject: signals: do_group_exit(): use signal_group_exit() more consistently

do_group_exit() checks SIGNAL_GROUP_EXIT to avoid taking sighand->siglock.
Since ed5d2cac114202fe2978a9cbcab8f5032796d538 exec() doesn't set this
flag, we should use signal_group_exit().

This is not needed for correctness, but can speedup the multithreaded exec
and makes the code more consistent.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index ae0f2c4e452..6d019aa8522 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1115,12 +1115,13 @@ asmlinkage long sys_exit(int error_code)
 NORET_TYPE void
 do_group_exit(int exit_code)
 {
+	struct signal_struct *sig = current->signal;
+
 	BUG_ON(exit_code & 0x80); /* core dumps don't get here */
 
-	if (current->signal->flags & SIGNAL_GROUP_EXIT)
-		exit_code = current->signal->group_exit_code;
+	if (signal_group_exit(sig))
+		exit_code = sig->group_exit_code;
 	else if (!thread_group_empty(current)) {
-		struct signal_struct *const sig = current->signal;
 		struct sighand_struct *const sighand = current->sighand;
 		spin_lock_irq(&sighand->siglock);
 		if (signal_group_exit(sig))
-- 
cgit v1.2.3-70-g09d2


From d839fd4d2e95a5fbc4d50aa9d17eed6a5f2094e6 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:53:11 -0700
Subject: ptrace: introduce task_detached() helper

exit.c has numerous "->exit_signal == -1" comparisons, this check is subtle
and deserves a helper.  Imho makes the code more parseable for humans.  At
least it's surely more greppable.

Also, a couple of whitespace cleanups. No functional changes.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 45 +++++++++++++++++++++++++--------------------
 1 file changed, 25 insertions(+), 20 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 6d019aa8522..4035d391a0d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -52,6 +52,11 @@
 
 static void exit_mm(struct task_struct * tsk);
 
+static inline int task_detached(struct task_struct *p)
+{
+	return p->exit_signal == -1;
+}
+
 static void __unhash_process(struct task_struct *p)
 {
 	nr_threads--;
@@ -160,7 +165,7 @@ repeat:
 	zap_leader = 0;
 	leader = p->group_leader;
 	if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
-		BUG_ON(leader->exit_signal == -1);
+		BUG_ON(task_detached(leader));
 		do_notify_parent(leader, leader->exit_signal);
 		/*
 		 * If we were the last child thread and the leader has
@@ -170,7 +175,7 @@ repeat:
 		 * do_notify_parent() will have marked it self-reaping in
 		 * that case.
 		 */
-		zap_leader = (leader->exit_signal == -1);
+		zap_leader = task_detached(leader);
 	}
 
 	write_unlock_irq(&tasklist_lock);
@@ -721,14 +726,14 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
 		return;
 
 	/* We don't want people slaying init.  */
-	if (p->exit_signal != -1)
+	if (!task_detached(p))
 		p->exit_signal = SIGCHLD;
 
 	/* If we'd notified the old parent about this child's death,
 	 * also notify the new parent.
 	 */
 	if (!traced && p->exit_state == EXIT_ZOMBIE &&
-	    p->exit_signal != -1 && thread_group_empty(p))
+	    !task_detached(p) && thread_group_empty(p))
 		do_notify_parent(p, p->exit_signal);
 
 	kill_orphaned_pgrp(p, father);
@@ -781,18 +786,18 @@ static void forget_original_parent(struct task_struct *father)
 		} else {
 			/* reparent ptraced task to its real parent */
 			__ptrace_unlink (p);
-			if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 &&
+			if (p->exit_state == EXIT_ZOMBIE && !task_detached(p) &&
 			    thread_group_empty(p))
 				do_notify_parent(p, p->exit_signal);
 		}
 
 		/*
-		 * if the ptraced child is a zombie with exit_signal == -1
-		 * we must collect it before we exit, or it will remain
-		 * zombie forever since we prevented it from self-reap itself
-		 * while it was being traced by us, to be able to see it in wait4.
+		 * if the ptraced child is a detached zombie we must collect
+		 * it before we exit, or it will remain zombie forever since
+		 * we prevented it from self-reap itself while it was being
+		 * traced by us, to be able to see it in wait4.
 		 */
-		if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1))
+		if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && task_detached(p)))
 			list_add(&p->ptrace_list, &ptrace_dead);
 	}
 
@@ -849,26 +854,26 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	 * we have changed execution domain as these two values started
 	 * the same after a fork.
 	 */
-	if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 &&
+	if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) &&
 	    (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
-	     tsk->self_exec_id != tsk->parent_exec_id)
-	    && !capable(CAP_KILL))
+	     tsk->self_exec_id != tsk->parent_exec_id) &&
+	    !capable(CAP_KILL))
 		tsk->exit_signal = SIGCHLD;
 
-
 	/* If something other than our normal parent is ptracing us, then
 	 * send it a SIGCHLD instead of honoring exit_signal.  exit_signal
 	 * only has special meaning to our real parent.
 	 */
-	if (tsk->exit_signal != -1 && thread_group_empty(tsk)) {
-		int signal = tsk->parent == tsk->real_parent ? tsk->exit_signal : SIGCHLD;
+	if (!task_detached(tsk) && thread_group_empty(tsk)) {
+		int signal = (tsk->parent == tsk->real_parent)
+				? tsk->exit_signal : SIGCHLD;
 		do_notify_parent(tsk, signal);
 	} else if (tsk->ptrace) {
 		do_notify_parent(tsk, SIGCHLD);
 	}
 
 	state = EXIT_ZOMBIE;
-	if (tsk->exit_signal == -1 && likely(!tsk->ptrace))
+	if (task_detached(tsk) && likely(!tsk->ptrace))
 		state = EXIT_DEAD;
 	tsk->exit_state = state;
 
@@ -1173,7 +1178,7 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
 	 * Do not consider detached threads that are
 	 * not ptraced:
 	 */
-	if (p->exit_signal == -1 && !p->ptrace)
+	if (task_detached(p) && !p->ptrace)
 		return 0;
 
 	/* Wait for all children (clone and not) if __WALL is set;
@@ -1365,9 +1370,9 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
 		 * If it's still not detached after that, don't release
 		 * it now.
 		 */
-		if (p->exit_signal != -1) {
+		if (!task_detached(p)) {
 			do_notify_parent(p, p->exit_signal);
-			if (p->exit_signal != -1) {
+			if (!task_detached(p)) {
 				p->exit_state = EXIT_ZOMBIE;
 				p = NULL;
 			}
-- 
cgit v1.2.3-70-g09d2


From 376e1d2531860358c8a79fecf5f4f42994d03c4d Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:53:12 -0700
Subject: reparent_thread: use same_thread_group()

Trivial, use same_thread_group() in reparent_thread().

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 4035d391a0d..413c81ec858 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -722,7 +722,7 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
 	/* If this is a threaded reparent there is no need to
 	 * notify anyone anything has happened.
 	 */
-	if (p->real_parent->group_leader == father->group_leader)
+	if (same_thread_group(p->real_parent, father))
 		return;
 
 	/* We don't want people slaying init.  */
-- 
cgit v1.2.3-70-g09d2


From 2800d8d19e51414403df8144eaa214bb03400b87 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:53:12 -0700
Subject: document de_thread() with exit_notify() connection

Add a couple of small comments, it is not easy to see what this code does.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c     | 2 +-
 kernel/exit.c | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel/exit.c')

diff --git a/fs/exec.c b/fs/exec.c
index 8fccc276d40..9f9f931ef94 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -798,7 +798,7 @@ static int de_thread(struct task_struct *tsk)
 	if (!thread_group_leader(tsk)) {
 		leader = tsk->group_leader;
 
-		sig->notify_count = -1;
+		sig->notify_count = -1;	/* for exit_notify() */
 		for (;;) {
 			write_lock_irq(&tasklist_lock);
 			if (likely(leader->exit_state))
diff --git a/kernel/exit.c b/kernel/exit.c
index 413c81ec858..879ed6e1c88 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -877,6 +877,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 		state = EXIT_DEAD;
 	tsk->exit_state = state;
 
+	/* mt-exec, de_thread() is waiting for us */
 	if (thread_group_leader(tsk) &&
 	    tsk->signal->notify_count < 0 &&
 	    tsk->signal->group_exit_task)
-- 
cgit v1.2.3-70-g09d2


From 53b6f9fbd3b63af14b4f6268e8b5b80d178d05bc Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:53:13 -0700
Subject: ptrace: introduce ptrace_reparented() helper

Add another trivial helper for the sake of grep.  It also auto-documents the
fact that ->parent != real_parent implies ->ptrace.

No functional changes.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ptrace.h | 4 ++++
 kernel/exit.c          | 9 ++++-----
 2 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index ebe0c17039c..f98501ba557 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -98,6 +98,10 @@ extern void ptrace_untrace(struct task_struct *child);
 extern int ptrace_may_attach(struct task_struct *task);
 extern int __ptrace_may_attach(struct task_struct *task);
 
+static inline int ptrace_reparented(struct task_struct *child)
+{
+	return child->real_parent != child->parent;
+}
 static inline void ptrace_link(struct task_struct *child,
 			       struct task_struct *new_parent)
 {
diff --git a/kernel/exit.c b/kernel/exit.c
index 879ed6e1c88..0da2921b1e7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -698,7 +698,7 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
 	if (unlikely(traced)) {
 		/* Preserve ptrace links if someone else is tracing this child.  */
 		list_del_init(&p->ptrace_list);
-		if (p->parent != p->real_parent)
+		if (ptrace_reparented(p))
 			list_add(&p->ptrace_list, &p->real_parent->ptrace_children);
 	} else {
 		/* If this child is being traced, then we're the one tracing it
@@ -865,8 +865,8 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	 * only has special meaning to our real parent.
 	 */
 	if (!task_detached(tsk) && thread_group_empty(tsk)) {
-		int signal = (tsk->parent == tsk->real_parent)
-				? tsk->exit_signal : SIGCHLD;
+		int signal = ptrace_reparented(tsk) ?
+				SIGCHLD : tsk->exit_signal;
 		do_notify_parent(tsk, signal);
 	} else if (tsk->ptrace) {
 		do_notify_parent(tsk, SIGCHLD);
@@ -1269,8 +1269,7 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
 		return 0;
 	}
 
-	/* traced means p->ptrace, but not vice versa */
-	traced = (p->real_parent != p->parent);
+	traced = ptrace_reparented(p);
 
 	if (likely(!traced)) {
 		struct signal_struct *psig;
-- 
cgit v1.2.3-70-g09d2


From 7d8da0962eaee30b4a380ded177349bfbdd6ac46 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:54:27 -0700
Subject: pids: __set_special_pids: use change_pid() helper

Use change_pid() instead of detach_pid() + attach_pid() in
__set_special_pids().

This way task_session() is not NULL in between.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc:  "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 0da2921b1e7..d3ad54677f9 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -334,13 +334,11 @@ void __set_special_pids(struct pid *pid)
 	pid_t nr = pid_nr(pid);
 
 	if (task_session(curr) != pid) {
-		detach_pid(curr, PIDTYPE_SID);
-		attach_pid(curr, PIDTYPE_SID, pid);
+		change_pid(curr, PIDTYPE_SID, pid);
 		set_task_session(curr, nr);
 	}
 	if (task_pgrp(curr) != pid) {
-		detach_pid(curr, PIDTYPE_PGID);
-		attach_pid(curr, PIDTYPE_PGID, pid);
+		change_pid(curr, PIDTYPE_PGID, pid);
 		set_task_pgrp(curr, nr);
 	}
 }
-- 
cgit v1.2.3-70-g09d2


From 9f3acc3140444a900ab280de942291959f0f615d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 24 Apr 2008 07:44:08 -0400
Subject: [PATCH] split linux/file.h

Initial splitoff of the low-level stuff; taken to fdtable.h

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/mips/kernel/kspd.c                      |  1 +
 arch/powerpc/platforms/cell/spufs/coredump.c |  1 +
 drivers/char/tty_audit.c                     |  1 +
 drivers/char/tty_io.c                        |  1 +
 fs/compat.c                                  |  1 +
 fs/dnotify.c                                 |  2 +-
 fs/exec.c                                    |  1 +
 fs/fcntl.c                                   |  1 +
 fs/file.c                                    |  1 +
 fs/file_table.c                              |  1 +
 fs/locks.c                                   |  1 +
 fs/open.c                                    |  1 +
 fs/proc/array.c                              |  1 +
 fs/proc/base.c                               |  1 +
 fs/select.c                                  |  1 +
 include/linux/fdtable.h                      | 99 ++++++++++++++++++++++++++++
 include/linux/file.h                         | 86 +-----------------------
 include/linux/init_task.h                    |  2 +-
 kernel/exit.c                                |  1 +
 kernel/fork.c                                |  1 +
 kernel/kmod.c                                |  1 +
 security/selinux/hooks.c                     |  1 +
 22 files changed, 121 insertions(+), 86 deletions(-)
 create mode 100644 include/linux/fdtable.h

(limited to 'kernel/exit.c')

diff --git a/arch/mips/kernel/kspd.c b/arch/mips/kernel/kspd.c
index 998c4efcce8..ceb62dce1c9 100644
--- a/arch/mips/kernel/kspd.c
+++ b/arch/mips/kernel/kspd.c
@@ -20,6 +20,7 @@
 #include <linux/sched.h>
 #include <linux/unistd.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/fs.h>
 #include <linux/syscalls.h>
 #include <linux/workqueue.h>
diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c
index b962c3ab470..af116aadba1 100644
--- a/arch/powerpc/platforms/cell/spufs/coredump.c
+++ b/arch/powerpc/platforms/cell/spufs/coredump.c
@@ -22,6 +22,7 @@
 
 #include <linux/elf.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/fs.h>
 #include <linux/list.h>
 #include <linux/module.h>
diff --git a/drivers/char/tty_audit.c b/drivers/char/tty_audit.c
index 6342b0534f4..3582f43345a 100644
--- a/drivers/char/tty_audit.c
+++ b/drivers/char/tty_audit.c
@@ -11,6 +11,7 @@
 
 #include <linux/audit.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/tty.h>
 
 struct tty_audit_buf {
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index 1d298c2cf93..49c1a2267a5 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -78,6 +78,7 @@
 #include <linux/tty_flip.h>
 #include <linux/devpts_fs.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/console.h>
 #include <linux/timer.h>
 #include <linux/ctype.h>
diff --git a/fs/compat.c b/fs/compat.c
index 139dc93c092..332a869d2c5 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -24,6 +24,7 @@
 #include <linux/fcntl.h>
 #include <linux/namei.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/vfs.h>
 #include <linux/ioctl.h>
 #include <linux/init.h>
diff --git a/fs/dnotify.c b/fs/dnotify.c
index eaecc4cfe54..676073b8dda 100644
--- a/fs/dnotify.c
+++ b/fs/dnotify.c
@@ -20,7 +20,7 @@
 #include <linux/init.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
-#include <linux/file.h>
+#include <linux/fdtable.h>
 
 int dir_notify_enable __read_mostly = 1;
 
diff --git a/fs/exec.c b/fs/exec.c
index 9f9f931ef94..aeaa9791d8b 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -24,6 +24,7 @@
 
 #include <linux/slab.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/mman.h>
 #include <linux/a.out.h>
 #include <linux/stat.h>
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 3f3ac630ccd..bfd776509a7 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -9,6 +9,7 @@
 #include <linux/mm.h>
 #include <linux/fs.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/capability.h>
 #include <linux/dnotify.h>
 #include <linux/smp_lock.h>
diff --git a/fs/file.c b/fs/file.c
index 5110acb1c9e..f6fbcb49faf 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -12,6 +12,7 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/bitops.h>
 #include <linux/interrupt.h>
 #include <linux/spinlock.h>
diff --git a/fs/file_table.c b/fs/file_table.c
index 7a0a9b87225..83084225b4c 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -8,6 +8,7 @@
 #include <linux/string.h>
 #include <linux/slab.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/fs.h>
diff --git a/fs/locks.c b/fs/locks.c
index 44d9a6a7ec5..663c069b59b 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -116,6 +116,7 @@
 
 #include <linux/capability.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/module.h>
diff --git a/fs/open.c b/fs/open.c
index 7af1f05d597..a1450086e92 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -7,6 +7,7 @@
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/quotaops.h>
 #include <linux/fsnotify.h>
 #include <linux/module.h>
diff --git a/fs/proc/array.c b/fs/proc/array.c
index c135cbdd912..dca997a93bf 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -73,6 +73,7 @@
 #include <linux/signal.h>
 #include <linux/highmem.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/times.h>
 #include <linux/cpuset.h>
 #include <linux/rcupdate.h>
diff --git a/fs/proc/base.c b/fs/proc/base.c
index fcf02f2deeb..808cbdc193d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -56,6 +56,7 @@
 #include <linux/init.h>
 #include <linux/capability.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/string.h>
 #include <linux/seq_file.h>
 #include <linux/namei.h>
diff --git a/fs/select.c b/fs/select.c
index 80d70387e79..8dda969614a 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -21,6 +21,7 @@
 #include <linux/poll.h>
 #include <linux/personality.h> /* for STICKY_TIMEOUTS */
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/fs.h>
 #include <linux/rcupdate.h>
 
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
new file mode 100644
index 00000000000..a118f3c0b24
--- /dev/null
+++ b/include/linux/fdtable.h
@@ -0,0 +1,99 @@
+/*
+ * descriptor table internals; you almost certainly want file.h instead.
+ */
+
+#ifndef __LINUX_FDTABLE_H
+#define __LINUX_FDTABLE_H
+
+#include <asm/atomic.h>
+#include <linux/posix_types.h>
+#include <linux/compiler.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/types.h>
+
+/*
+ * The default fd array needs to be at least BITS_PER_LONG,
+ * as this is the granularity returned by copy_fdset().
+ */
+#define NR_OPEN_DEFAULT BITS_PER_LONG
+
+/*
+ * The embedded_fd_set is a small fd_set,
+ * suitable for most tasks (which open <= BITS_PER_LONG files)
+ */
+struct embedded_fd_set {
+	unsigned long fds_bits[1];
+};
+
+struct fdtable {
+	unsigned int max_fds;
+	struct file ** fd;      /* current fd array */
+	fd_set *close_on_exec;
+	fd_set *open_fds;
+	struct rcu_head rcu;
+	struct fdtable *next;
+};
+
+/*
+ * Open file table structure
+ */
+struct files_struct {
+  /*
+   * read mostly part
+   */
+	atomic_t count;
+	struct fdtable *fdt;
+	struct fdtable fdtab;
+  /*
+   * written part on a separate cache line in SMP
+   */
+	spinlock_t file_lock ____cacheline_aligned_in_smp;
+	int next_fd;
+	struct embedded_fd_set close_on_exec_init;
+	struct embedded_fd_set open_fds_init;
+	struct file * fd_array[NR_OPEN_DEFAULT];
+};
+
+#define files_fdtable(files) (rcu_dereference((files)->fdt))
+
+extern struct kmem_cache *filp_cachep;
+
+struct file_operations;
+struct vfsmount;
+struct dentry;
+
+extern int expand_files(struct files_struct *, int nr);
+extern void free_fdtable_rcu(struct rcu_head *rcu);
+extern void __init files_defer_init(void);
+
+static inline void free_fdtable(struct fdtable *fdt)
+{
+	call_rcu(&fdt->rcu, free_fdtable_rcu);
+}
+
+static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd)
+{
+	struct file * file = NULL;
+	struct fdtable *fdt = files_fdtable(files);
+
+	if (fd < fdt->max_fds)
+		file = rcu_dereference(fdt->fd[fd]);
+	return file;
+}
+
+/*
+ * Check whether the specified fd has an open file.
+ */
+#define fcheck(fd)	fcheck_files(current->files, fd)
+
+struct task_struct;
+
+struct files_struct *get_files_struct(struct task_struct *);
+void put_files_struct(struct files_struct *fs);
+void reset_files_struct(struct files_struct *);
+int unshare_files(struct files_struct **);
+
+extern struct kmem_cache *files_cachep;
+
+#endif /* __LINUX_FDTABLE_H */
diff --git a/include/linux/file.h b/include/linux/file.h
index 69baf5a4f0a..27c64bdc68c 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -5,59 +5,11 @@
 #ifndef __LINUX_FILE_H
 #define __LINUX_FILE_H
 
-#include <asm/atomic.h>
-#include <linux/posix_types.h>
 #include <linux/compiler.h>
-#include <linux/spinlock.h>
-#include <linux/rcupdate.h>
 #include <linux/types.h>
+#include <linux/posix_types.h>
 
-/*
- * The default fd array needs to be at least BITS_PER_LONG,
- * as this is the granularity returned by copy_fdset().
- */
-#define NR_OPEN_DEFAULT BITS_PER_LONG
-
-/*
- * The embedded_fd_set is a small fd_set,
- * suitable for most tasks (which open <= BITS_PER_LONG files)
- */
-struct embedded_fd_set {
-	unsigned long fds_bits[1];
-};
-
-struct fdtable {
-	unsigned int max_fds;
-	struct file ** fd;      /* current fd array */
-	fd_set *close_on_exec;
-	fd_set *open_fds;
-	struct rcu_head rcu;
-	struct fdtable *next;
-};
-
-/*
- * Open file table structure
- */
-struct files_struct {
-  /*
-   * read mostly part
-   */
-	atomic_t count;
-	struct fdtable *fdt;
-	struct fdtable fdtab;
-  /*
-   * written part on a separate cache line in SMP
-   */
-	spinlock_t file_lock ____cacheline_aligned_in_smp;
-	int next_fd;
-	struct embedded_fd_set close_on_exec_init;
-	struct embedded_fd_set open_fds_init;
-	struct file * fd_array[NR_OPEN_DEFAULT];
-};
-
-#define files_fdtable(files) (rcu_dereference((files)->fdt))
-
-extern struct kmem_cache *filp_cachep;
+struct file;
 
 extern void __fput(struct file *);
 extern void fput(struct file *);
@@ -85,41 +37,7 @@ extern void put_filp(struct file *);
 extern int get_unused_fd(void);
 extern int get_unused_fd_flags(int flags);
 extern void put_unused_fd(unsigned int fd);
-struct kmem_cache;
-
-extern int expand_files(struct files_struct *, int nr);
-extern void free_fdtable_rcu(struct rcu_head *rcu);
-extern void __init files_defer_init(void);
-
-static inline void free_fdtable(struct fdtable *fdt)
-{
-	call_rcu(&fdt->rcu, free_fdtable_rcu);
-}
-
-static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd)
-{
-	struct file * file = NULL;
-	struct fdtable *fdt = files_fdtable(files);
-
-	if (fd < fdt->max_fds)
-		file = rcu_dereference(fdt->fd[fd]);
-	return file;
-}
-
-/*
- * Check whether the specified fd has an open file.
- */
-#define fcheck(fd)	fcheck_files(current->files, fd)
 
 extern void fd_install(unsigned int fd, struct file *file);
 
-struct task_struct;
-
-struct files_struct *get_files_struct(struct task_struct *);
-void put_files_struct(struct files_struct *fs);
-void reset_files_struct(struct files_struct *);
-int unshare_files(struct files_struct **);
-
-extern struct kmem_cache *files_cachep;
-
 #endif /* __LINUX_FILE_H */
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index bf6b8a61f8d..b24c2875aa0 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -1,7 +1,7 @@
 #ifndef _LINUX__INIT_TASK_H
 #define _LINUX__INIT_TASK_H
 
-#include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/rcupdate.h>
 #include <linux/irqflags.h>
 #include <linux/utsname.h>
diff --git a/kernel/exit.c b/kernel/exit.c
index d3ad54677f9..1510f78a0ff 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -19,6 +19,7 @@
 #include <linux/acct.h>
 #include <linux/tsacct_kern.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/binfmts.h>
 #include <linux/nsproxy.h>
 #include <linux/pid_namespace.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index 2bb675af4de..933e60ebcca 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -22,6 +22,7 @@
 #include <linux/mempolicy.h>
 #include <linux/sem.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/key.h>
 #include <linux/binfmts.h>
 #include <linux/mman.h>
diff --git a/kernel/kmod.c b/kernel/kmod.c
index e2764047ec0..8df97d3dfda 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -27,6 +27,7 @@
 #include <linux/mnt_namespace.h>
 #include <linux/completion.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/workqueue.h>
 #include <linux/security.h>
 #include <linux/mount.h>
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 1b50a6ebc55..1c864c0efe2 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -39,6 +39,7 @@
 #include <linux/spinlock.h>
 #include <linux/syscalls.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/ext2_fs.h>
-- 
cgit v1.2.3-70-g09d2