Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile          |    2
-rw-r--r--  kernel/acct.c            |   45
-rw-r--r--  kernel/audit.c           |    3
-rw-r--r--  kernel/compat.c          |    9
-rw-r--r--  kernel/cpuset.c          |  226
-rw-r--r--  kernel/exit.c            |   26
-rw-r--r--  kernel/fork.c            |  104
-rw-r--r--  kernel/futex.c           |  137
-rw-r--r--  kernel/intermodule.c     |    3
-rw-r--r--  kernel/irq/handle.c      |    2
-rw-r--r--  kernel/irq/manage.c      |    4
-rw-r--r--  kernel/irq/proc.c        |   14
-rw-r--r--  kernel/kprobes.c         |   94
-rw-r--r--  kernel/module.c          |   44
-rw-r--r--  kernel/params.c          |    4
-rw-r--r--  kernel/posix-timers.c    |   28
-rw-r--r--  kernel/power/Kconfig     |   15
-rw-r--r--  kernel/power/disk.c      |   55
-rw-r--r--  kernel/power/main.c      |    5
-rw-r--r--  kernel/power/pm.c        |    3
-rw-r--r--  kernel/power/process.c   |   29
-rw-r--r--  kernel/power/swsusp.c    |  202
-rw-r--r--  kernel/printk.c          |   13
-rw-r--r--  kernel/ptrace.c          |   41
-rw-r--r--  kernel/rcupdate.c        |   14
-rw-r--r--  kernel/resource.c        |    3
-rw-r--r--  kernel/sched.c           |  617
-rw-r--r--  kernel/signal.c          |   86
-rw-r--r--  kernel/softirq.c         |    2
-rw-r--r--  kernel/softlockup.c      |  151
-rw-r--r--  kernel/spinlock.c        |   15
-rw-r--r--  kernel/sys.c             |    6
-rw-r--r--  kernel/sysctl.c          |    4
-rw-r--r--  kernel/timer.c           |   55
-rw-r--r--  kernel/workqueue.c       |    5
35 files changed, 1580 insertions(+), 486 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index cb05cd05d23..ff4dc02ce17 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -12,6 +12,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
obj-$(CONFIG_FUTEX) += futex.o
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_SMP) += cpu.o spinlock.o
+obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
@@ -27,6 +28,7 @@ obj-$(CONFIG_AUDIT) += audit.o
obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_SYSFS) += ksysfs.o
+obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_SECCOMP) += seccomp.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 4168f631868..b756f527497 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -165,7 +165,7 @@ out:
}
/*
- * Close the old accouting file (if currently open) and then replace
+ * Close the old accounting file (if currently open) and then replace
* it with file (if non-NULL).
*
* NOTE: acct_globals.lock MUST be held on entry and exit.
@@ -199,11 +199,16 @@ static void acct_file_reopen(struct file *file)
}
}
-/*
- * sys_acct() is the only system call needed to implement process
- * accounting. It takes the name of the file where accounting records
- * should be written. If the filename is NULL, accounting will be
- * shutdown.
+/**
+ * sys_acct - enable/disable process accounting
+ * @name: file name for accounting records or NULL to shutdown accounting
+ *
+ * Returns 0 for success or negative errno values for failure.
+ *
+ * sys_acct() is the only system call needed to implement process
+ * accounting. It takes the name of the file where accounting records
+ * should be written. If the filename is NULL, accounting will be
+ * shutdown.
*/
asmlinkage long sys_acct(const char __user *name)
{
@@ -220,7 +225,7 @@ asmlinkage long sys_acct(const char __user *name)
return (PTR_ERR(tmp));
}
/* Difference from BSD - they don't do O_APPEND */
- file = filp_open(tmp, O_WRONLY|O_APPEND, 0);
+ file = filp_open(tmp, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
putname(tmp);
if (IS_ERR(file)) {
return (PTR_ERR(file));
@@ -250,9 +255,12 @@ asmlinkage long sys_acct(const char __user *name)
return (0);
}
-/*
- * If the accouting is turned on for a file in the filesystem pointed
- * to by sb, turn accouting off.
+/**
+ * acct_auto_close - turn off a filesystem's accounting if it is on
+ * @sb: super block for the filesystem
+ *
+ * If the accounting is turned on for a file in the filesystem pointed
+ * to by sb, turn accounting off.
*/
void acct_auto_close(struct super_block *sb)
{
@@ -503,8 +511,11 @@ static void do_acct_process(long exitcode, struct file *file)
set_fs(fs);
}
-/*
+/**
* acct_process - now just a wrapper around do_acct_process
+ * @exitcode: task exit code
+ *
+ * handles process accounting for an exiting task
*/
void acct_process(long exitcode)
{
@@ -530,9 +541,9 @@ void acct_process(long exitcode)
}
-/*
- * acct_update_integrals
- * - update mm integral fields in task_struct
+/**
+ * acct_update_integrals - update mm integral fields in task_struct
+ * @tsk: task_struct for accounting
*/
void acct_update_integrals(struct task_struct *tsk)
{
@@ -547,9 +558,9 @@ void acct_update_integrals(struct task_struct *tsk)
}
}
-/*
- * acct_clear_integrals
- * - clear the mm integral fields in task_struct
+/**
+ * acct_clear_integrals - clear the mm integral fields in task_struct
+ * @tsk: task_struct whose accounting fields are cleared
*/
void acct_clear_integrals(struct task_struct *tsk)
{
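For reference, the semantics documented in the new sys_acct() kerneldoc above can be exercised from userspace through the acct(2) wrapper. A minimal sketch, with an illustrative log path (the caller needs CAP_SYS_PACCT):

	/* Userspace sketch, not part of this patch: acct(2) drives
	 * sys_acct().  "/var/log/pacct" is an illustrative path; any
	 * writable regular file works. */
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		if (acct("/var/log/pacct") == -1)	/* start accounting */
			perror("acct");
		/* ... a record is appended for each exiting process ... */
		if (acct(NULL) == -1)			/* NULL shuts it down */
			perror("acct");
		return 0;
	}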
diff --git a/kernel/audit.c b/kernel/audit.c
index 8376ec10cf2..83096b67510 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -513,7 +513,8 @@ static int __init audit_init(void)
{
printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
audit_default ? "enabled" : "disabled");
- audit_sock = netlink_kernel_create(NETLINK_AUDIT, audit_receive);
+ audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive,
+ THIS_MODULE);
if (!audit_sock)
audit_panic("cannot initialize netlink socket");
diff --git a/kernel/compat.c b/kernel/compat.c
index ddfcaaa8662..102296e21ea 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -48,8 +48,7 @@ static long compat_nanosleep_restart(struct restart_block *restart)
if (!time_after(expire, now))
return 0;
- current->state = TASK_INTERRUPTIBLE;
- expire = schedule_timeout(expire - now);
+ expire = schedule_timeout_interruptible(expire - now);
if (expire == 0)
return 0;
@@ -82,8 +81,7 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
return -EINVAL;
expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
- current->state = TASK_INTERRUPTIBLE;
- expire = schedule_timeout(expire);
+ expire = schedule_timeout_interruptible(expire);
if (expire == 0)
return 0;
@@ -795,8 +793,7 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
- current->state = TASK_INTERRUPTIBLE;
- timeout = schedule_timeout(timeout);
+ timeout = schedule_timeout_interruptible(timeout);
spin_lock_irq(&current->sighand->siglock);
sig = dequeue_signal(current, &s, &info);
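The schedule_timeout_interruptible() helper these call sites now use is introduced elsewhere in this series (kernel/timer.c also appears in the diffstat); it is, roughly, the old two-step pattern folded into one call:

	/* Rough shape of the helper, as a sketch: set the task state,
	 * then sleep, so callers cannot forget the state setting. */
	signed long __sched schedule_timeout_interruptible(signed long timeout)
	{
		__set_current_state(TASK_INTERRUPTIBLE);
		return schedule_timeout(timeout);
	}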
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8ab1b4e518b..79866bc6b3a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -180,6 +180,42 @@ static struct super_block *cpuset_sb = NULL;
*/
static DECLARE_MUTEX(cpuset_sem);
+static struct task_struct *cpuset_sem_owner;
+static int cpuset_sem_depth;
+
+/*
+ * The global cpuset semaphore cpuset_sem can be needed by the
+ * memory allocator to update a task's mems_allowed (see the calls
+ * to cpuset_update_current_mems_allowed()) or to walk up the
+ * cpuset hierarchy to find a mem_exclusive cpuset (see the calls
+ * to cpuset_excl_nodes_overlap()).
+ *
+ * But if the memory allocation is being done by cpuset.c code, it
+ * usually already holds cpuset_sem. Double tripping on a kernel
+ * semaphore deadlocks the current task, and any other task that
+ * subsequently tries to obtain the lock.
+ *
+ * Run all up's and down's on cpuset_sem through the following
+ * wrappers, which will detect this nested locking, and avoid
+ * deadlocking.
+ */
+
+static inline void cpuset_down(struct semaphore *psem)
+{
+ if (cpuset_sem_owner != current) {
+ down(psem);
+ cpuset_sem_owner = current;
+ }
+ cpuset_sem_depth++;
+}
+
+static inline void cpuset_up(struct semaphore *psem)
+{
+ if (--cpuset_sem_depth == 0) {
+ cpuset_sem_owner = NULL;
+ up(psem);
+ }
+}
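To make the nested locking concrete, a sketch of a re-entrant sequence from a single task; only the outermost pair touches the underlying semaphore:

	/* Illustration only: cpuset code takes the lock, then re-enters
	 * through the memory allocator.  Inner pairs just adjust the
	 * depth counter. */
	cpuset_down(&cpuset_sem);	/* owner = current, depth 1: down() */
	cpuset_down(&cpuset_sem);	/* same owner, depth 2: no down()   */
	cpuset_up(&cpuset_sem);		/* back to depth 1: still held      */
	cpuset_up(&cpuset_sem);		/* depth 0: owner cleared, up()     */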
/*
* A couple of forward declarations required, due to cyclic reference loop:
@@ -522,19 +558,10 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
* Refresh current tasks mems_allowed and mems_generation from
* current tasks cpuset. Call with cpuset_sem held.
*
- * Be sure to call refresh_mems() on any cpuset operation which
- * (1) holds cpuset_sem, and (2) might possibly alloc memory.
- * Call after obtaining cpuset_sem lock, before any possible
- * allocation. Otherwise one risks trying to allocate memory
- * while the task cpuset_mems_generation is not the same as
- * the mems_generation in its cpuset, which would deadlock on
- * cpuset_sem in cpuset_update_current_mems_allowed().
- *
- * Since we hold cpuset_sem, once refresh_mems() is called, the
- * test (current->cpuset_mems_generation != cs->mems_generation)
- * in cpuset_update_current_mems_allowed() will remain false,
- * until we drop cpuset_sem. Anyone else who would change our
- * cpusets mems_generation needs to lock cpuset_sem first.
+ * This routine is needed to update the per-task mems_allowed
+ * data, within the task's context, when it is trying to allocate
+ * memory (in various mm/mempolicy.c routines) and notices
+ * that some other task has been modifying its cpuset.
*/
static void refresh_mems(void)
@@ -628,13 +655,6 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
* lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
*/
-/*
- * Hack to avoid 2.6.13 partial node dynamic sched domain bug.
- * Disable letting 'cpu_exclusive' cpusets define dynamic sched
- * domains, until the sched domain can handle partial nodes.
- * Remove this #if hackery when sched domains fixed.
- */
-#if 0
static void update_cpu_domains(struct cpuset *cur)
{
struct cpuset *c, *par = cur->parent;
@@ -675,11 +695,6 @@ static void update_cpu_domains(struct cpuset *cur)
partition_sched_domains(&pspan, &cspan);
unlock_cpu_hotplug();
}
-#else
-static void update_cpu_domains(struct cpuset *cur)
-{
-}
-#endif
static int update_cpumask(struct cpuset *cs, char *buf)
{
@@ -852,7 +867,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
}
buffer[nbytes] = 0; /* nul-terminate */
- down(&cpuset_sem);
+ cpuset_down(&cpuset_sem);
if (is_removed(cs)) {
retval = -ENODEV;
@@ -886,7 +901,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
if (retval == 0)
retval = nbytes;
out2:
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
cpuset_release_agent(pathbuf);
out1:
kfree(buffer);
@@ -926,9 +941,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
{
cpumask_t mask;
- down(&cpuset_sem);
+ cpuset_down(&cpuset_sem);
mask = cs->cpus_allowed;
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
return cpulist_scnprintf(page, PAGE_SIZE, mask);
}
@@ -937,9 +952,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
{
nodemask_t mask;
- down(&cpuset_sem);
+ cpuset_down(&cpuset_sem);
mask = cs->mems_allowed;
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
return nodelist_scnprintf(page, PAGE_SIZE, mask);
}
@@ -984,6 +999,10 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
*s++ = '\n';
*s = '\0';
+ /* Do nothing if *ppos is at the eof or beyond the eof. */
+ if (s - page <= *ppos)
+ return 0;
+
start = page + *ppos;
n = s - start;
retval = n - copy_to_user(buf, start, min(n, nbytes));
@@ -1342,8 +1361,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
if (!cs)
return -ENOMEM;
- down(&cpuset_sem);
- refresh_mems();
+ cpuset_down(&cpuset_sem);
cs->flags = 0;
if (notify_on_release(parent))
set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
@@ -1368,14 +1386,14 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
* will down() this new directory's i_sem and if we race with
* another mkdir, we might deadlock.
*/
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
err = cpuset_populate_dir(cs->dentry);
/* If err < 0, we have a half-filled directory - oh well ;) */
return 0;
err:
list_del(&cs->sibling);
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
kfree(cs);
return err;
}
@@ -1397,14 +1415,13 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
/* the vfs holds both inode->i_sem already */
- down(&cpuset_sem);
- refresh_mems();
+ cpuset_down(&cpuset_sem);
if (atomic_read(&cs->count) > 0) {
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
return -EBUSY;
}
if (!list_empty(&cs->children)) {
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
return -EBUSY;
}
parent = cs->parent;
@@ -1420,7 +1437,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
spin_unlock(&d->d_lock);
cpuset_d_remove_dir(d);
dput(d);
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
cpuset_release_agent(pathbuf);
return 0;
}
@@ -1523,10 +1540,10 @@ void cpuset_exit(struct task_struct *tsk)
if (notify_on_release(cs)) {
char *pathbuf = NULL;
- down(&cpuset_sem);
+ cpuset_down(&cpuset_sem);
if (atomic_dec_and_test(&cs->count))
check_for_release(cs, &pathbuf);
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
cpuset_release_agent(pathbuf);
} else {
atomic_dec(&cs->count);
@@ -1547,11 +1564,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk)
{
cpumask_t mask;
- down(&cpuset_sem);
+ cpuset_down(&cpuset_sem);
task_lock((struct task_struct *)tsk);
guarantee_online_cpus(tsk->cpuset, &mask);
task_unlock((struct task_struct *)tsk);
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
return mask;
}
@@ -1576,9 +1593,9 @@ void cpuset_update_current_mems_allowed(void)
if (!cs)
return; /* task is exiting */
if (current->cpuset_mems_generation != cs->mems_generation) {
- down(&cpuset_sem);
+ cpuset_down(&cpuset_sem);
refresh_mems();
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
}
}
@@ -1611,17 +1628,114 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
return 0;
}
+/*
+ * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
+ * ancestor to the specified cpuset. Call while holding cpuset_sem.
+ * If no ancestor is mem_exclusive (an unusual configuration), then
+ * returns the root cpuset.
+ */
+static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
+{
+ while (!is_mem_exclusive(cs) && cs->parent)
+ cs = cs->parent;
+ return cs;
+}
+
/**
- * cpuset_zone_allowed - is zone z allowed in current->mems_allowed
- * @z: zone in question
+ * cpuset_zone_allowed - Can we allocate memory on zone z's memory node?
+ * @z: is this zone on an allowed node?
+ * @gfp_mask: memory allocation flags (we use __GFP_HARDWALL)
*
- * Is zone z allowed in current->mems_allowed, or is
- * the CPU in interrupt context? (zone is always allowed in this case)
- */
-int cpuset_zone_allowed(struct zone *z)
+ * If we're in interrupt, yes, we can always allocate. If zone
+ * z's node is in our task's mems_allowed, yes. If it's not a
+ * __GFP_HARDWALL request and this zone's node is in the nearest
+ * mem_exclusive cpuset ancestor to this task's cpuset, yes.
+ * Otherwise, no.
+ *
+ * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
+ * and do not allow allocations outside the current task's cpuset.
+ * GFP_KERNEL allocations are not so marked, so can escape to the
+ * nearest mem_exclusive ancestor cpuset.
+ *
+ * Scanning up parent cpusets requires cpuset_sem. The __alloc_pages()
+ * routine only calls here with __GFP_HARDWALL bit _not_ set if
+ * it's a GFP_KERNEL allocation, and all nodes in the current task's
+ * mems_allowed came up empty on the first pass over the zonelist.
+ * So only GFP_KERNEL allocations, if all nodes in the cpuset are
+ * short of memory, might require taking the cpuset_sem semaphore.
+ *
+ * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages()
+ * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing
+ * hardwall cpusets - no allocation on a node outside the cpuset is
+ * allowed (unless in interrupt, of course).
+ *
+ * The second loop doesn't even call here for GFP_ATOMIC requests
+ * (if the __alloc_pages() local variable 'wait' is set). That check
+ * and the checks below have the combined effect in the second loop of
+ * the __alloc_pages() routine that:
+ * in_interrupt - any node ok (current task context irrelevant)
+ * GFP_ATOMIC - any node ok
+ * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok
+ * GFP_USER - only nodes in the current task's mems_allowed ok.
+ **/
+
+int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask)
+{
+ int node; /* node that zone z is on */
+ const struct cpuset *cs; /* current cpuset ancestors */
+ int allowed = 1; /* is allocation in zone z allowed? */
+
+ if (in_interrupt())
+ return 1;
+ node = z->zone_pgdat->node_id;
+ if (node_isset(node, current->mems_allowed))
+ return 1;
+ if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
+ return 0;
+
+ /* Not hardwall and node outside mems_allowed: scan up cpusets */
+ cpuset_down(&cpuset_sem);
+ cs = current->cpuset;
+ if (!cs)
+ goto done; /* current task exiting */
+ cs = nearest_exclusive_ancestor(cs);
+ allowed = node_isset(node, cs->mems_allowed);
+done:
+ cpuset_up(&cpuset_sem);
+ return allowed;
+}
+
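The two-pass caller behaviour described in the comment corresponds roughly to a loop of this shape in mm/page_alloc.c (a paraphrase, not the exact code):

	/* Sketch of the __alloc_pages() pattern described above.  First
	 * pass: hardwall, stay inside the task's own cpuset.  Later
	 * passes: GFP_KERNEL may escape to the nearest mem_exclusive
	 * ancestor. */
	for (i = 0; (z = zones[i]) != NULL; i++) {
		if (!cpuset_zone_allowed(z, gfp_mask | __GFP_HARDWALL))
			continue;	/* hardwall pass */
		/* ... try to allocate from zone z ... */
	}
	for (i = 0; (z = zones[i]) != NULL; i++) {
		if (!cpuset_zone_allowed(z, gfp_mask))
			continue;	/* may scan up mem_exclusive ancestors */
		/* ... try harder to allocate from zone z ... */
	}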
+/**
+ * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors?
+ * @p: pointer to task_struct of some other task.
+ *
+ * Description: Return true if the nearest mem_exclusive ancestor
+ * cpusets of tasks @p and current overlap. Used by oom killer to
+ * determine if task @p's memory usage might impact the memory
+ * available to the current task.
+ *
+ * Acquires cpuset_sem - not suitable for calling from a fast path.
+ **/
+
+int cpuset_excl_nodes_overlap(const struct task_struct *p)
{
- return in_interrupt() ||
- node_isset(z->zone_pgdat->node_id, current->mems_allowed);
+ const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */
+ int overlap = 0; /* do cpusets overlap? */
+
+ cpuset_down(&cpuset_sem);
+ cs1 = current->cpuset;
+ if (!cs1)
+ goto done; /* current task exiting */
+ cs2 = p->cpuset;
+ if (!cs2)
+ goto done; /* task p is exiting */
+ cs1 = nearest_exclusive_ancestor(cs1);
+ cs2 = nearest_exclusive_ancestor(cs2);
+ overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
+done:
+ cpuset_up(&cpuset_sem);
+
+ return overlap;
}
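As a usage note, the oom killer consumes this predicate along these lines (paraphrasing the badness() heuristic in mm/oom_kill.c):

	/* Sketch: a task whose mem_exclusive cpuset does not overlap
	 * ours is a poor kill candidate, since freeing its memory
	 * would not help our allocation, so its score is discounted. */
	if (!cpuset_excl_nodes_overlap(p))
		points /= 8;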
/*
@@ -1642,7 +1756,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
return -ENOMEM;
tsk = m->private;
- down(&cpuset_sem);
+ cpuset_down(&cpuset_sem);
task_lock(tsk);
cs = tsk->cpuset;
task_unlock(tsk);
@@ -1657,7 +1771,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
seq_puts(m, buf);
seq_putc(m, '\n');
out:
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
kfree(buf);
return retval;
}
diff --git a/kernel/exit.c b/kernel/exit.c
index 5b0fb9f09f2..6d2089a1bce 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -368,17 +368,19 @@ EXPORT_SYMBOL(daemonize);
static inline void close_files(struct files_struct * files)
{
int i, j;
+ struct fdtable *fdt;
j = 0;
+ fdt = files_fdtable(files);
for (;;) {
unsigned long set;
i = j * __NFDBITS;
- if (i >= files->max_fdset || i >= files->max_fds)
+ if (i >= fdt->max_fdset || i >= fdt->max_fds)
break;
- set = files->open_fds->fds_bits[j++];
+ set = fdt->open_fds->fds_bits[j++];
while (set) {
if (set & 1) {
- struct file * file = xchg(&files->fd[i], NULL);
+ struct file * file = xchg(&fdt->fd[i], NULL);
if (file)
filp_close(file, files);
}
@@ -403,18 +405,22 @@ struct files_struct *get_files_struct(struct task_struct *task)
void fastcall put_files_struct(struct files_struct *files)
{
+ struct fdtable *fdt;
+
if (atomic_dec_and_test(&files->count)) {
close_files(files);
/*
* Free the fd and fdset arrays if we expanded them.
+ * If the fdtable was embedded, pass files for freeing
+ * at the end of the RCU grace period. Otherwise,
+ * you can free files immediately.
*/
- if (files->fd != &files->fd_array[0])
- free_fd_array(files->fd, files->max_fds);
- if (files->max_fdset > __FD_SETSIZE) {
- free_fdset(files->open_fds, files->max_fdset);
- free_fdset(files->close_on_exec, files->max_fdset);
- }
- kmem_cache_free(files_cachep, files);
+ fdt = files_fdtable(files);
+ if (fdt == &files->fdtab)
+ fdt->free_files = files;
+ else
+ kmem_cache_free(files_cachep, files);
+ free_fdtable(fdt);
}
}
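The free_fdtable() call above defers the real teardown with RCU so that lock-free readers of the fd table can drain first; its rough shape (paraphrasing fs/file.c):

	/* Sketch: queue the fdtable's rcu_head; after a grace period the
	 * callback frees the fd arrays, and also frees the files_struct
	 * itself when fdt->free_files was set above. */
	void free_fdtable(struct fdtable *fdt)
	{
		call_rcu(&fdt->rcu, free_fdtable_rcu);
	}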
diff --git a/kernel/fork.c b/kernel/fork.c
index b65187f0c74..8149f360288 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -35,6 +35,7 @@
#include <linux/syscalls.h>
#include <linux/jiffies.h>
#include <linux/futex.h>
+#include <linux/rcupdate.h>
#include <linux/ptrace.h>
#include <linux/mount.h>
#include <linux/audit.h>
@@ -176,6 +177,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
/* One for us, one for whoever does the "release_task()" (usually parent) */
atomic_set(&tsk->usage,2);
+ atomic_set(&tsk->fs_excl, 0);
return tsk;
}
@@ -564,24 +566,53 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
return 0;
}
-static int count_open_files(struct files_struct *files, int size)
+static int count_open_files(struct fdtable *fdt)
{
+ int size = fdt->max_fdset;
int i;
/* Find the last open fd */
for (i = size/(8*sizeof(long)); i > 0; ) {
- if (files->open_fds->fds_bits[--i])
+ if (fdt->open_fds->fds_bits[--i])
break;
}
i = (i+1) * 8 * sizeof(long);
return i;
}
+static struct files_struct *alloc_files(void)
+{
+ struct files_struct *newf;
+ struct fdtable *fdt;
+
+ newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
+ if (!newf)
+ goto out;
+
+ atomic_set(&newf->count, 1);
+
+ spin_lock_init(&newf->file_lock);
+ fdt = &newf->fdtab;
+ fdt->next_fd = 0;
+ fdt->max_fds = NR_OPEN_DEFAULT;
+ fdt->max_fdset = __FD_SETSIZE;
+ fdt->close_on_exec = &newf->close_on_exec_init;
+ fdt->open_fds = &newf->open_fds_init;
+ fdt->fd = &newf->fd_array[0];
+ INIT_RCU_HEAD(&fdt->rcu);
+ fdt->free_files = NULL;
+ fdt->next = NULL;
+ rcu_assign_pointer(newf->fdt, fdt);
+out:
+ return newf;
+}
+
static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
{
struct files_struct *oldf, *newf;
struct file **old_fds, **new_fds;
int open_files, size, i, error = 0, expand;
+ struct fdtable *old_fdt, *new_fdt;
/*
* A background process may not have any files ...
@@ -602,35 +633,27 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
*/
tsk->files = NULL;
error = -ENOMEM;
- newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
- if (!newf)
+ newf = alloc_files();
+ if (!newf)
goto out;
- atomic_set(&newf->count, 1);
-
- spin_lock_init(&newf->file_lock);
- newf->next_fd = 0;
- newf->max_fds = NR_OPEN_DEFAULT;
- newf->max_fdset = __FD_SETSIZE;
- newf->close_on_exec = &newf->close_on_exec_init;
- newf->open_fds = &newf->open_fds_init;
- newf->fd = &newf->fd_array[0];
-
spin_lock(&oldf->file_lock);
-
- open_files = count_open_files(oldf, oldf->max_fdset);
+ old_fdt = files_fdtable(oldf);
+ new_fdt = files_fdtable(newf);
+ size = old_fdt->max_fdset;
+ open_files = count_open_files(old_fdt);
expand = 0;
/*
* Check whether we need to allocate a larger fd array or fd set.
* Note: we're not a clone task, so the open count won't change.
*/
- if (open_files > newf->max_fdset) {
- newf->max_fdset = 0;
+ if (open_files > new_fdt->max_fdset) {
+ new_fdt->max_fdset = 0;
expand = 1;
}
- if (open_files > newf->max_fds) {
- newf->max_fds = 0;
+ if (open_files > new_fdt->max_fds) {
+ new_fdt->max_fds = 0;
expand = 1;
}
@@ -642,14 +665,21 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
spin_unlock(&newf->file_lock);
if (error < 0)
goto out_release;
+ new_fdt = files_fdtable(newf);
+ /*
+ * Reacquire the oldf lock and a pointer to its fd table;
+ * it may have a new, bigger fd table by now, and we need
+ * the latest pointer.
+ */
spin_lock(&oldf->file_lock);
+ old_fdt = files_fdtable(oldf);
}
- old_fds = oldf->fd;
- new_fds = newf->fd;
+ old_fds = old_fdt->fd;
+ new_fds = new_fdt->fd;
- memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8);
- memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8);
+ memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8);
+ memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8);
for (i = open_files; i != 0; i--) {
struct file *f = *old_fds++;
@@ -662,24 +692,24 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
* is partway through open(). So make sure that this
* fd is available to the new process.
*/
- FD_CLR(open_files - i, newf->open_fds);
+ FD_CLR(open_files - i, new_fdt->open_fds);
}
- *new_fds++ = f;
+ rcu_assign_pointer(*new_fds++, f);
}
spin_unlock(&oldf->file_lock);
/* compute the remainder to be cleared */
- size = (newf->max_fds - open_files) * sizeof(struct file *);
+ size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
/* This is long word aligned thus could use a optimized version */
memset(new_fds, 0, size);
- if (newf->max_fdset > open_files) {
- int left = (newf->max_fdset-open_files)/8;
+ if (new_fdt->max_fdset > open_files) {
+ int left = (new_fdt->max_fdset-open_files)/8;
int start = open_files / (8 * sizeof(unsigned long));
- memset(&newf->open_fds->fds_bits[start], 0, left);
- memset(&newf->close_on_exec->fds_bits[start], 0, left);
+ memset(&new_fdt->open_fds->fds_bits[start], 0, left);
+ memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
}
tsk->files = newf;
@@ -688,9 +718,9 @@ out:
return error;
out_release:
- free_fdset (newf->close_on_exec, newf->max_fdset);
- free_fdset (newf->open_fds, newf->max_fdset);
- free_fd_array(newf->fd, newf->max_fds);