diff options
author | Paul Mackerras <paulus@samba.org> | 2005-10-31 13:37:12 +1100 |
---|---|---|
committer | Paul Mackerras <paulus@samba.org> | 2005-10-31 13:37:12 +1100 |
commit | 23fd07750a789a66fe88cf173d52a18f1a387da4 (patch) | |
tree | 06fdd6df35fdb835abdaa9b754d62f6b84b97250 /kernel | |
parent | bd787d438a59266af3c9f6351644c85ef1dd21fe (diff) | |
parent | ed28f96ac1960f30f818374d65be71d2fdf811b0 (diff) |
Merge ../linux-2.6 by hand
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Makefile | 2 | ||||
-rw-r--r-- | kernel/acct.c | 2 | ||||
-rw-r--r-- | kernel/audit.c | 6 | ||||
-rw-r--r-- | kernel/auditsc.c | 2 | ||||
-rw-r--r-- | kernel/cpu.c | 1 | ||||
-rw-r--r-- | kernel/cpuset.c | 466 | ||||
-rw-r--r-- | kernel/exit.c | 34 | ||||
-rw-r--r-- | kernel/fork.c | 31 | ||||
-rw-r--r-- | kernel/futex.c | 6 | ||||
-rw-r--r-- | kernel/kallsyms.c | 1 | ||||
-rw-r--r-- | kernel/kexec.c | 11 | ||||
-rw-r--r-- | kernel/kmod.c | 6 | ||||
-rw-r--r-- | kernel/kprobes.c | 1 | ||||
-rw-r--r-- | kernel/kthread.c | 13 | ||||
-rw-r--r-- | kernel/params.c | 1 | ||||
-rw-r--r-- | kernel/posix-cpu-timers.c | 16 | ||||
-rw-r--r-- | kernel/posix-timers.c | 19 | ||||
-rw-r--r-- | kernel/power/Makefile | 2 | ||||
-rw-r--r-- | kernel/power/disk.c | 22 | ||||
-rw-r--r-- | kernel/power/main.c | 5 | ||||
-rw-r--r-- | kernel/power/power.h | 17 | ||||
-rw-r--r-- | kernel/power/snapshot.c | 435 | ||||
-rw-r--r-- | kernel/power/swsusp.c | 569 | ||||
-rw-r--r-- | kernel/printk.c | 78 | ||||
-rw-r--r-- | kernel/ptrace.c | 7 | ||||
-rw-r--r-- | kernel/rcupdate.c | 10 | ||||
-rw-r--r-- | kernel/rcutorture.c | 492 | ||||
-rw-r--r-- | kernel/sched.c | 3 | ||||
-rw-r--r-- | kernel/signal.c | 150 | ||||
-rw-r--r-- | kernel/time.c | 26 | ||||
-rw-r--r-- | kernel/timer.c | 337 | ||||
-rw-r--r-- | kernel/workqueue.c | 33 |
32 files changed, 1661 insertions, 1143 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index ff4dc02ce17..4f5a1453093 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -22,7 +22,6 @@ obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_IKCONFIG) += configs.o -obj-$(CONFIG_IKCONFIG_PROC) += configs.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o obj-$(CONFIG_AUDIT) += audit.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o @@ -32,6 +31,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_SECCOMP) += seccomp.o +obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/acct.c b/kernel/acct.c index b756f527497..2e3f4a47e7d 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -553,7 +553,7 @@ void acct_update_integrals(struct task_struct *tsk) if (delta == 0) return; tsk->acct_stimexpd = tsk->stime; - tsk->acct_rss_mem1 += delta * get_mm_counter(tsk->mm, rss); + tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; } } diff --git a/kernel/audit.c b/kernel/audit.c index aefa73a8a58..0c56320d38d 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -133,7 +133,7 @@ struct audit_buffer { struct list_head list; struct sk_buff *skb; /* formatted skb ready to send */ struct audit_context *ctx; /* NULL or associated context */ - int gfp_mask; + gfp_t gfp_mask; }; static void audit_set_pid(struct audit_buffer *ab, pid_t pid) @@ -647,7 +647,7 @@ static inline void audit_get_stamp(struct audit_context *ctx, * will be written at syscall exit. If there is no associated task, tsk * should be NULL. */ -struct audit_buffer *audit_log_start(struct audit_context *ctx, int gfp_mask, +struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { struct audit_buffer *ab = NULL; @@ -879,7 +879,7 @@ void audit_log_end(struct audit_buffer *ab) /* Log an audit record. This is a convenience function that calls * audit_log_start, audit_log_vformat, and audit_log_end. It may be * called in any context. */ -void audit_log(struct audit_context *ctx, int gfp_mask, int type, +void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, const char *fmt, ...) { struct audit_buffer *ab; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 88696f639aa..d8a68509e72 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -803,7 +803,7 @@ static void audit_log_task_info(struct audit_buffer *ab) up_read(&mm->mmap_sem); } -static void audit_log_exit(struct audit_context *context, unsigned int gfp_mask) +static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) { int i; struct audit_buffer *ab; diff --git a/kernel/cpu.c b/kernel/cpu.c index 53d8263ae12..3619e939182 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -17,6 +17,7 @@ /* This protects CPUs going up and down... */ DECLARE_MUTEX(cpucontrol); +EXPORT_SYMBOL_GPL(cpucontrol); static struct notifier_block *cpu_chain; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 28176d083f7..5a737ed9dac 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -32,6 +32,7 @@ #include <linux/kernel.h> #include <linux/kmod.h> #include <linux/list.h> +#include <linux/mempolicy.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/mount.h> @@ -60,6 +61,9 @@ struct cpuset { cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ + /* + * Count is atomic so can incr (fork) or decr (exit) without a lock. + */ atomic_t count; /* count tasks using this cpuset */ /* @@ -142,80 +146,91 @@ static struct vfsmount *cpuset_mount; static struct super_block *cpuset_sb = NULL; /* - * cpuset_sem should be held by anyone who is depending on the children - * or sibling lists of any cpuset, or performing non-atomic operations - * on the flags or *_allowed values of a cpuset, such as raising the - * CS_REMOVED flag bit iff it is not already raised, or reading and - * conditionally modifying the *_allowed values. One kernel global - * cpuset semaphore should be sufficient - these things don't change - * that much. - * - * The code that modifies cpusets holds cpuset_sem across the entire - * operation, from cpuset_common_file_write() down, single threading - * all cpuset modifications (except for counter manipulations from - * fork and exit) across the system. This presumes that cpuset - * modifications are rare - better kept simple and safe, even if slow. - * - * The code that reads cpusets, such as in cpuset_common_file_read() - * and below, only holds cpuset_sem across small pieces of code, such - * as when reading out possibly multi-word cpumasks and nodemasks, as - * the risks are less, and the desire for performance a little greater. - * The proc_cpuset_show() routine needs to hold cpuset_sem to insure - * that no cs->dentry is NULL, as it walks up the cpuset tree to root. - * - * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't - * (usually) grab cpuset_sem. These are the two most performance - * critical pieces of code here. The exception occurs on exit(), - * when a task in a notify_on_release cpuset exits. Then cpuset_sem + * We have two global cpuset semaphores below. They can nest. + * It is ok to first take manage_sem, then nest callback_sem. We also + * require taking task_lock() when dereferencing a tasks cpuset pointer. + * See "The task_lock() exception", at the end of this comment. + * + * A task must hold both semaphores to modify cpusets. If a task + * holds manage_sem, then it blocks others wanting that semaphore, + * ensuring that it is the only task able to also acquire callback_sem + * and be able to modify cpusets. It can perform various checks on + * the cpuset structure first, knowing nothing will change. It can + * also allocate memory while just holding manage_sem. While it is + * performing these checks, various callback routines can briefly + * acquire callback_sem to query cpusets. Once it is ready to make + * the changes, it takes callback_sem, blocking everyone else. + * + * Calls to the kernel memory allocator can not be made while holding + * callback_sem, as that would risk double tripping on callback_sem + * from one of the callbacks into the cpuset code from within + * __alloc_pages(). + * + * If a task is only holding callback_sem, then it has read-only + * access to cpusets. + * + * The task_struct fields mems_allowed and mems_generation may only + * be accessed in the context of that task, so require no locks. + * + * Any task can increment and decrement the count field without lock. + * So in general, code holding manage_sem or callback_sem can't rely + * on the count field not changing. However, if the count goes to + * zero, then only attach_task(), which holds both semaphores, can + * increment it again. Because a count of zero means that no tasks + * are currently attached, therefore there is no way a task attached + * to that cpuset can fork (the other way to increment the count). + * So code holding manage_sem or callback_sem can safely assume that + * if the count is zero, it will stay zero. Similarly, if a task + * holds manage_sem or callback_sem on a cpuset with zero count, it + * knows that the cpuset won't be removed, as cpuset_rmdir() needs + * both of those semaphores. + * + * A possible optimization to improve parallelism would be to make + * callback_sem a R/W semaphore (rwsem), allowing the callback routines + * to proceed in parallel, with read access, until the holder of + * manage_sem needed to take this rwsem for exclusive write access + * and modify some cpusets. + * + * The cpuset_common_file_write handler for operations that modify + * the cpuset hierarchy holds manage_sem across the entire operation, + * single threading all such cpuset modifications across the system. + * + * The cpuset_common_file_read() handlers only hold callback_sem across + * small pieces of code, such as when reading out possibly multi-word + * cpumasks and nodemasks. + * + * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't + * (usually) take either semaphore. These are the two most performance + * critical pieces of code here. The exception occurs on cpuset_exit(), + * when a task in a notify_on_release cpuset exits. Then manage_sem * is taken, and if the cpuset count is zero, a usermode call made * to /sbin/cpuset_release_agent with the name of the cpuset (path * relative to the root of cpuset file system) as the argument. * - * A cpuset can only be deleted if both its 'count' of using tasks is - * zero, and its list of 'children' cpusets is empty. Since all tasks - * in the system use _some_ cpuset, and since there is always at least - * one task in the system (init, pid == 1), therefore, top_cpuset - * always has either children cpusets and/or using tasks. So no need - * for any special hack to ensure that top_cpuset cannot be deleted. + * A cpuset can only be deleted if both its 'count' of using tasks + * is zero, and its list of 'children' cpusets is empty. Since all + * tasks in the system use _some_ cpuset, and since there is always at + * least one task in the system (init, pid == 1), therefore, top_cpuset + * always has either children cpusets and/or using tasks. So we don't + * need a special hack to ensure that top_cpuset cannot be deleted. + * + * The above "Tale of Two Semaphores" would be complete, but for: + * + * The task_lock() exception + * + * The need for this exception arises from the action of attach_task(), + * which overwrites one tasks cpuset pointer with another. It does + * so using both semaphores, however there are several performance + * critical places that need to reference task->cpuset without the + * expense of grabbing a system global semaphore. Therefore except as + * noted below, when dereferencing or, as in attach_task(), modifying + * a tasks cpuset pointer we use task_lock(), which acts on a spinlock + * (task->alloc_lock) already in the task_struct routinely used for + * such matters. */ -static DECLARE_MUTEX(cpuset_sem); -static struct task_struct *cpuset_sem_owner; -static int cpuset_sem_depth; - -/* - * The global cpuset semaphore cpuset_sem can be needed by the - * memory allocator to update a tasks mems_allowed (see the calls - * to cpuset_update_current_mems_allowed()) or to walk up the - * cpuset hierarchy to find a mem_exclusive cpuset see the calls - * to cpuset_excl_nodes_overlap()). - * - * But if the memory allocation is being done by cpuset.c code, it - * usually already holds cpuset_sem. Double tripping on a kernel - * semaphore deadlocks the current task, and any other task that - * subsequently tries to obtain the lock. - * - * Run all up's and down's on cpuset_sem through the following - * wrappers, which will detect this nested locking, and avoid - * deadlocking. - */ - -static inline void cpuset_down(struct semaphore *psem) -{ - if (cpuset_sem_owner != current) { - down(psem); - cpuset_sem_owner = current; - } - cpuset_sem_depth++; -} - -static inline void cpuset_up(struct semaphore *psem) -{ - if (--cpuset_sem_depth == 0) { - cpuset_sem_owner = NULL; - up(psem); - } -} +static DECLARE_MUTEX(manage_sem); +static DECLARE_MUTEX(callback_sem); /* * A couple of forward declarations required, due to cyclic reference loop: @@ -390,7 +405,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry) } /* - * Call with cpuset_sem held. Writes path of cpuset into buf. + * Call with manage_sem held. Writes path of cpuset into buf. * Returns 0 on success, -errno on error. */ @@ -442,10 +457,11 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen) * status of the /sbin/cpuset_release_agent task, so no sense holding * our caller up for that. * - * The simple act of forking that task might require more memory, - * which might need cpuset_sem. So this routine must be called while - * cpuset_sem is not held, to avoid a possible deadlock. See also - * comments for check_for_release(), below. + * When we had only one cpuset semaphore, we had to call this + * without holding it, to avoid deadlock when call_usermodehelper() + * allocated memory. With two locks, we could now call this while + * holding manage_sem, but we still don't, so as to minimize + * the time manage_sem is held. */ static void cpuset_release_agent(const char *pathbuf) @@ -477,15 +493,15 @@ static void cpuset_release_agent(const char *pathbuf) * cs is notify_on_release() and now both the user count is zero and * the list of children is empty, prepare cpuset path in a kmalloc'd * buffer, to be returned via ppathbuf, so that the caller can invoke - * cpuset_release_agent() with it later on, once cpuset_sem is dropped. - * Call here with cpuset_sem held. + * cpuset_release_agent() with it later on, once manage_sem is dropped. + * Call here with manage_sem held. * * This check_for_release() routine is responsible for kmalloc'ing * pathbuf. The above cpuset_release_agent() is responsible for * kfree'ing pathbuf. The caller of these routines is responsible * for providing a pathbuf pointer, initialized to NULL, then - * calling check_for_release() with cpuset_sem held and the address - * of the pathbuf pointer, then dropping cpuset_sem, then calling + * calling check_for_release() with manage_sem held and the address + * of the pathbuf pointer, then dropping manage_sem, then calling * cpuset_release_agent() with pathbuf, as set by check_for_release(). */ @@ -516,7 +532,7 @@ static void check_for_release(struct cpuset *cs, char **ppathbuf) * One way or another, we guarantee to return some non-empty subset * of cpu_online_map. * - * Call with cpuset_sem held. + * Call with callback_sem held. */ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) @@ -540,7 +556,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) * One way or another, we guarantee to return some non-empty subset * of node_online_map. * - * Call with cpuset_sem held. + * Call with callback_sem held. */ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) @@ -555,22 +571,47 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) } /* - * Refresh current tasks mems_allowed and mems_generation from - * current tasks cpuset. Call with cpuset_sem held. + * Refresh current tasks mems_allowed and mems_generation from current + * tasks cpuset. * - * This routine is needed to update the per-task mems_allowed - * data, within the tasks context, when it is trying to allocate - * memory (in various mm/mempolicy.c routines) and notices - * that some other task has been modifying its cpuset. + * Call without callback_sem or task_lock() held. May be called with + * or without manage_sem held. Will acquire task_lock() and might + * acquire callback_sem during call. + * + * The task_lock() is required to dereference current->cpuset safely. + * Without it, we could pick up the pointer value of current->cpuset + * in one instruction, and then attach_task could give us a different + * cpuset, and then the cpuset we had could be removed and freed, + * and then on our next instruction, we could dereference a no longer + * valid cpuset pointer to get its mems_generation field. + * + * This routine is needed to update the per-task mems_allowed data, + * within the tasks context, when it is trying to allocate memory + * (in various mm/mempolicy.c routines) and notices that some other + * task has been modifying its cpuset. */ static void refresh_mems(void) { - struct cpuset *cs = current->cpuset; + int my_cpusets_mem_gen; + + task_lock(current); + my_cpusets_mem_gen = current->cpuset->mems_generation; + task_unlock(current); - if (current->cpuset_mems_generation != cs->mems_generation) { + if (current->cpuset_mems_generation != my_cpusets_mem_gen) { + struct cpuset *cs; + nodemask_t oldmem = current->mems_allowed; + + down(&callback_sem); + task_lock(current); + cs = current->cpuset; guarantee_online_mems(cs, ¤t->mems_allowed); current->cpuset_mems_generation = cs->mems_generation; + task_unlock(current); + up(&callback_sem); + if (!nodes_equal(oldmem, current->mems_allowed)) + numa_policy_rebind(&oldmem, ¤t->mems_allowed); } } @@ -579,7 +620,7 @@ static void refresh_mems(void) * * One cpuset is a subset of another if all its allowed CPUs and * Memory Nodes are a subset of the other, and its exclusive flags - * are only set if the other's are set. + * are only set if the other's are set. Call holding manage_sem. */ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) @@ -597,7 +638,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) * If we replaced the flag and mask values of the current cpuset * (cur) with those values in the trial cpuset (trial), would * our various subset and exclusive rules still be valid? Presumes - * cpuset_sem held. + * manage_sem held. * * 'cur' is the address of an actual, in-use cpuset. Operations * such as list traversal that depend on the actual address of the @@ -651,7 +692,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) * exclusive child cpusets * Build these two partitions by calling partition_sched_domains * - * Call with cpuset_sem held. May nest a call to the + * Call with manage_sem held. May nest a call to the * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. */ @@ -696,6 +737,10 @@ static void update_cpu_domains(struct cpuset *cur) unlock_cpu_hotplug(); } +/* + * Call with manage_sem held. May take callback_sem during call. + */ + static int update_cpumask(struct cpuset *cs, char *buf) { struct cpuset trialcs; @@ -712,12 +757,18 @@ static int update_cpumask(struct cpuset *cs, char *buf) if (retval < 0) return retval; cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); + down(&callback_sem); cs->cpus_allowed = trialcs.cpus_allowed; + up(&callback_sem); if (is_cpu_exclusive(cs) && !cpus_unchanged) update_cpu_domains(cs); return 0; } +/* + * Call with manage_sem held. May take callback_sem during call. + */ + static int update_nodemask(struct cpuset *cs, char *buf) { struct cpuset trialcs; @@ -732,9 +783,11 @@ static int update_nodemask(struct cpuset *cs, char *buf) return -ENOSPC; retval = validate_change(cs, &trialcs); if (retval == 0) { + down(&callback_sem); cs->mems_allowed = trialcs.mems_allowed; atomic_inc(&cpuset_mems_generation); cs->mems_generation = atomic_read(&cpuset_mems_generation); + up(&callback_sem); } return retval; } @@ -745,6 +798,8 @@ static int update_nodemask(struct cpuset *cs, char *buf) * CS_NOTIFY_ON_RELEASE) * cs: the cpuset to update * buf: the buffer where we read the 0 or 1 + * + * Call with manage_sem held. */ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) @@ -766,16 +821,27 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) return err; cpu_exclusive_changed = (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); + down(&callback_sem); if (turning_on) set_bit(bit, &cs->flags); else clear_bit(bit, &cs->flags); + up(&callback_sem); if (cpu_exclusive_changed) update_cpu_domains(cs); return 0; } +/* + * Attack task specified by pid in 'pidbuf' to cpuset 'cs', possibly + * writing the path of the old cpuset in 'ppathbuf' if it needs to be + * notified on release. + * + * Call holding manage_sem. May take callback_sem and task_lock of + * the task 'pid' during call. + */ + static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) { pid_t pid; @@ -792,7 +858,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) read_lock(&tasklist_lock); tsk = find_task_by_pid(pid); - if (!tsk) { + if (!tsk || tsk->flags & PF_EXITING) { read_unlock(&tasklist_lock); return -ESRCH; } @@ -810,10 +876,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) get_task_struct(tsk); } + down(&callback_sem); + task_lock(tsk); oldcs = tsk->cpuset; if (!oldcs) { task_unlock(tsk); + up(&callback_sem); put_task_struct(tsk); return -ESRCH; } @@ -824,6 +893,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) guarantee_online_cpus(cs, &cpus); set_cpus_allowed(tsk, cpus); + up(&callback_sem); put_task_struct(tsk); if (atomic_dec_and_test(&oldcs->count)) check_for_release(oldcs, ppathbuf); @@ -867,7 +937,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us } buffer[nbytes] = 0; /* nul-terminate */ - cpuset_down(&cpuset_sem); + down(&manage_sem); if (is_removed(cs)) { retval = -ENODEV; @@ -901,7 +971,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us if (retval == 0) retval = nbytes; out2: - cpuset_up(&cpuset_sem); + up(&manage_sem); cpuset_release_agent(pathbuf); out1: kfree(buffer); @@ -941,9 +1011,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) { cpumask_t mask; - cpuset_down(&cpuset_sem); + down(&callback_sem); mask = cs->cpus_allowed; - cpuset_up(&cpuset_sem); + up(&callback_sem); return cpulist_scnprintf(page, PAGE_SIZE, mask); } @@ -952,9 +1022,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) { nodemask_t mask; - cpuset_down(&cpuset_sem); + down(&callback_sem); mask = cs->mems_allowed; - cpuset_up(&cpuset_sem); + up(&callback_sem); return nodelist_scnprintf(page, PAGE_SIZE, mask); } @@ -995,7 +1065,6 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, goto out; } *s++ = '\n'; - *s = '\0'; retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); out: @@ -1048,6 +1117,21 @@ static int cpuset_file_release(struct inode *inode, struct file *file) return 0; } +/* + * cpuset_rename - Only allow simple rename of directories in place. + */ +static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + if (!S_ISDIR(old_dentry->d_inode->i_mode)) + return -ENOTDIR; + if (new_dentry->d_inode) + return -EEXIST; + if (old_dir != new_dir) + return -EIO; + return simple_rename(old_dir, old_dentry, new_dir, new_dentry); +} + static struct file_operations cpuset_file_operations = { .read = cpuset_file_read, .write = cpuset_file_write, @@ -1060,6 +1144,7 @@ static struct inode_operations cpuset_dir_inode_operations = { .lookup = simple_lookup, .mkdir = cpuset_mkdir, .rmdir = cpuset_rmdir, + .rename = cpuset_rename, }; static int cpuset_create_file(struct dentry *dentry, int mode) @@ -1163,7 +1248,9 @@ struct ctr_struct { /* * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'. - * Return actual number of pids loaded. + * Return actual number of pids loaded. No need to task_lock(p) + * when reading out p->cpuset, as we don't really care if it changes + * on the next cycle, and we are not going to try to dereference it. */ static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs) { @@ -1205,6 +1292,12 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) return cnt; } +/* + * Handle an open on 'tasks' file. Prepare a buffer listing the + * process id's of tasks currently attached to the cpuset being opened. + * + * Does not require any specific cpuset semaphores, and does not take any. + */ static int cpuset_tasks_open(struct inode *unused, struct file *file) { struct cpuset *cs = __d_cs(file->f_dentry->d_parent); @@ -1352,7 +1445,8 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) if (!cs) return -ENOMEM; - cpuset_down(&cpuset_sem); + down(&manage_sem); + refresh_mems(); cs->flags = 0; if (notify_on_release(parent)) set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); @@ -1366,25 +1460,27 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) cs->parent = parent; + down(&callback_sem); list_add(&cs->sibling, &cs->parent->children); + up(&callback_sem); err = cpuset_create_dir(cs, name, mode); if (err < 0) goto err; /* - * Release cpuset_sem before cpuset_populate_dir() because it + * Release manage_sem before cpuset_populate_dir() because it * will down() this new directory's i_sem and if we race with * another mkdir, we might deadlock. */ - cpuset_up(&cpuset_sem); + up(&manage_sem); err = cpuset_populate_dir(cs->dentry); /* If err < 0, we have a half-filled directory - oh well ;) */ return 0; err: list_del(&cs->sibling); - cpuset_up(&cpuset_sem); + up(&manage_sem); kfree(cs); return err; } @@ -1406,29 +1502,32 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) /* the vfs holds both inode->i_sem already */ - cpuset_down(&cpuset_sem); + down(&manage_sem); + refresh_mems(); if (atomic_read(&cs->count) > 0) { - cpuset_up(&cpuset_sem); + up(&manage_sem); return -EBUSY; } if (!list_empty(&cs->children)) { - cpuset_up(&cpuset_sem); + up(&manage_sem); return -EBUSY; } parent = cs->parent; + down(&callback_sem); set_bit(CS_REMOVED, &cs->flags); if (is_cpu_exclusive(cs)) update_cpu_domains(cs); list_del(&cs->sibling); /* delete my sibling from parent->children */ - if (list_empty(&parent->children)) - check_for_release(parent, &pathbuf); spin_lock(&cs->dentry->d_lock); d = dget(cs->dentry); cs->dentry = NULL; spin_unlock(&d->d_lock); cpuset_d_remove_dir(d); dput(d); - cpuset_up(&cpuset_sem); + up(&callback_sem); + if (list_empty(&parent->children)) + check_for_release(parent, &pathbuf); + up(&manage_sem); cpuset_release_agent(pathbuf); return 0; } @@ -1488,16 +1587,26 @@ void __init cpuset_init_smp(void) * cpuset_fork - attach newly forked task to its parents cpuset. * @tsk: pointer to task_struct of forking parent process. * - * Description: By default, on fork, a task inherits its - * parent's cpuset. The pointer to the shared cpuset is - * automatically copied in fork.c by dup_task_struct(). - * This cpuset_fork() routine need only increment the usage - * counter in that cpuset. + * Description: A task inherits its parent's cpuset at fork(). + * + * A pointer to the shared cpuset was automatically copied in fork.c + * by dup_task_struct(). However, we ignore that copy, since it was + * not made under the protection of task_lock(), so might no longer be + * a valid cpuset pointer. attach_task() might have already changed + * current->cpuset, allowing the previously referenced cpuset to + * be removed and freed. Instead, we task_lock(current) and copy + * its present value of current->cpuset for our freshly forked child. + * + * At the point that cpuset_fork() is called, 'current' is the parent + * task, and the passed argument 'child' points to the child task. **/ -void cpuset_fork(struct task_struct *tsk) +void cpuset_fork(struct task_struct *child) { - atomic_inc(&tsk->cpuset->count); + task_lock(current); + child->cpuset = current->cpuset; + atomic_inc(&child->cpuset->count); + task_unlock(current); } /** @@ -1506,35 +1615,42 @@ void cpuset_fork(struct task_struct *tsk) * * Description: Detach cpuset from @tsk and release it. * - * Note that cpusets marked notify_on_release force every task - * in them to take the global cpuset_sem semaphore when exiting. - * This could impact scaling on very large systems. Be reluctant - * to use notify_on_release cpusets where very high task exit - * scaling is required on large systems. - * - * Don't even think about derefencing 'cs' after the cpuset use - * count goes to zero, except inside a critical section guarded - * by the cpuset_sem semaphore. If y |