diff options
Diffstat (limited to 'kernel')
50 files changed, 4681 insertions, 2024 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore new file mode 100644 index 00000000000..f2ab70073bd --- /dev/null +++ b/kernel/.gitignore @@ -0,0 +1,5 @@ +# +# Generated files +# +config_data.h +config_data.gz diff --git a/kernel/Makefile b/kernel/Makefile index 4f5a1453093..4ae0fbde815 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -6,15 +6,18 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ exit.o itimer.o time.o softirq.o resource.o \ sysctl.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o \ - rcupdate.o intermodule.o extable.o params.o posix-timers.o \ - kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o + rcupdate.o extable.o params.o posix-timers.o \ + kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ + hrtimer.o +obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o +obj-$(CONFIG_OBSOLETE_INTERMODULE) += intermodule.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_PM) += power/ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o @@ -29,7 +32,6 @@ obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ -obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o diff --git a/kernel/acct.c b/kernel/acct.c index 6312d6bd43e..065d8b4e51e 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -47,6 +47,7 @@ #include <linux/mm.h> #include <linux/slab.h> #include <linux/acct.h> +#include <linux/capability.h> #include <linux/file.h> #include <linux/tty.h> #include <linux/security.h> @@ -427,6 +428,7 @@ static void do_acct_process(long exitcode, struct file *file) u64 elapsed; u64 run_time; struct timespec uptime; + unsigned long jiffies; /* * First check to see if there is enough free_space to continue @@ -467,12 +469,12 @@ static void do_acct_process(long exitcode, struct file *file) #endif do_div(elapsed, AHZ); ac.ac_btime = xtime.tv_sec - elapsed; - ac.ac_utime = encode_comp_t(jiffies_to_AHZ( - current->signal->utime + - current->group_leader->utime)); - ac.ac_stime = encode_comp_t(jiffies_to_AHZ( - current->signal->stime + - current->group_leader->stime)); + jiffies = cputime_to_jiffies(cputime_add(current->group_leader->utime, + current->signal->utime)); + ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies)); + jiffies = cputime_to_jiffies(cputime_add(current->group_leader->stime, + current->signal->stime)); + ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies)); /* we really need to bite the bullet and change layout */ ac.ac_uid = current->uid; ac.ac_gid = current->gid; @@ -580,7 +582,8 @@ void acct_process(long exitcode) void acct_update_integrals(struct task_struct *tsk) { if (likely(tsk->mm)) { - long delta = tsk->stime - tsk->acct_stimexpd; + long delta = + cputime_to_jiffies(tsk->stime) - tsk->acct_stimexpd; if (delta == 0) return; diff --git a/kernel/audit.c b/kernel/audit.c index 32fa03ad198..0a813d2883e 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -42,8 +42,8 @@ */ #include <linux/init.h> -#include <asm/atomic.h> #include <asm/types.h> +#include <asm/atomic.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/err.h> @@ -267,7 +267,7 @@ static int audit_set_failure(int state, uid_t loginuid) return old; } -int kauditd_thread(void *dummy) +static int kauditd_thread(void *dummy) { struct sk_buff *skb; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index d8a68509e72..685c25175d9 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -30,8 +30,8 @@ */ #include <linux/init.h> -#include <asm/atomic.h> #include <asm/types.h> +#include <asm/atomic.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/mount.h> diff --git a/kernel/capability.c b/kernel/capability.c index 8986a37a67e..bfa3c92e16f 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -7,6 +7,7 @@ * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net> */ +#include <linux/capability.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/security.h> diff --git a/kernel/compat.c b/kernel/compat.c index 102296e21ea..1867290c37e 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -514,6 +514,24 @@ static int put_compat_itimerspec(struct compat_itimerspec __user *dst, return 0; } +long compat_sys_timer_create(clockid_t which_clock, + struct compat_sigevent __user *timer_event_spec, + timer_t __user *created_timer_id) +{ + struct sigevent __user *event = NULL; + + if (timer_event_spec) { + struct sigevent kevent; + + event = compat_alloc_user_space(sizeof(*event)); + if (get_compat_sigevent(&kevent, timer_event_spec) || + copy_to_user(event, &kevent, sizeof(*event))) + return -EFAULT; + } + + return sys_timer_create(which_clock, event, created_timer_id); +} + long compat_sys_timer_settime(timer_t timer_id, int flags, struct compat_itimerspec __user *new, struct compat_itimerspec __user *old) @@ -649,8 +667,6 @@ int get_compat_sigevent(struct sigevent *event, ? -EFAULT : 0; } -/* timer_create is architecture specific because it needs sigevent conversion */ - long compat_get_bitmap(unsigned long *mask, compat_ulong_t __user *umask, unsigned long bitmap_size) { @@ -855,3 +871,31 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr) } #endif /* __ARCH_WANT_COMPAT_SYS_TIME */ + +#ifdef __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND +asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat_size_t sigsetsize) +{ + sigset_t newset; + compat_sigset_t newset32; + + /* XXX: Don't preclude handling different sized sigset_t's. */ + if (sigsetsize != sizeof(sigset_t)) + return -EINVAL; + + if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t))) + return -EFAULT; + sigset_from_compat(&newset, &newset32); + sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); + + spin_lock_irq(¤t->sighand->siglock); + current->saved_sigmask = current->blocked; + current->blocked = newset; + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + + current->state = TASK_INTERRUPTIBLE; + schedule(); + set_thread_flag(TIF_RESTORE_SIGMASK); + return -ERESTARTNOHAND; +} +#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */ diff --git a/kernel/configs.c b/kernel/configs.c index 986f7af31e0..009e1ebdcb8 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -3,7 +3,7 @@ * Echo the kernel .config file used to build the kernel * * Copyright (C) 2002 Khalid Aziz <khalid_aziz@hp.com> - * Copyright (C) 2002 Randy Dunlap <rddunlap@osdl.org> + * Copyright (C) 2002 Randy Dunlap <rdunlap@xenotime.net> * Copyright (C) 2002 Al Stone <ahs3@fc.hp.com> * Copyright (C) 2002 Hewlett-Packard Company * diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7430640f981..fe2f71f92ae 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -39,6 +39,7 @@ #include <linux/namei.h> #include <linux/pagemap.h> #include <linux/proc_fs.h> +#include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/slab.h> @@ -54,7 +55,23 @@ #include <asm/atomic.h> #include <asm/semaphore.h> -#define CPUSET_SUPER_MAGIC 0x27e0eb +#define CPUSET_SUPER_MAGIC 0x27e0eb + +/* + * Tracks how many cpusets are currently defined in system. + * When there is only one cpuset (the root cpuset) we can + * short circuit some hooks. + */ +int number_of_cpusets __read_mostly; + +/* See "Frequency meter" comments, below. */ + +struct fmeter { + int cnt; /* unprocessed events count */ + int val; /* most recent output value */ + time_t time; /* clock (secs) when val computed */ + spinlock_t lock; /* guards read or write of above */ +}; struct cpuset { unsigned long flags; /* "unsigned long" so bitops work */ @@ -80,13 +97,16 @@ struct cpuset { * Copy of global cpuset_mems_generation as of the most * recent time this cpuset changed its mems_allowed. */ - int mems_generation; + int mems_generation; + + struct fmeter fmeter; /* memory_pressure filter */ }; /* bits in struct cpuset flags field */ typedef enum { CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, + CS_MEMORY_MIGRATE, CS_REMOVED, CS_NOTIFY_ON_RELEASE } cpuset_flagbits_t; @@ -112,6 +132,11 @@ static inline int notify_on_release(const struct cpuset *cs) return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); } +static inline int is_memory_migrate(const struct cpuset *cs) +{ + return !!test_bit(CS_MEMORY_MIGRATE, &cs->flags); +} + /* * Increment this atomic integer everytime any cpuset changes its * mems_allowed value. Users of cpusets can track this generation @@ -137,13 +162,10 @@ static struct cpuset top_cpuset = { .count = ATOMIC_INIT(0), .sibling = LIST_HEAD_INIT(top_cpuset.sibling), .children = LIST_HEAD_INIT(top_cpuset.children), - .parent = NULL, - .dentry = NULL, - .mems_generation = 0, }; static struct vfsmount *cpuset_mount; -static struct super_block *cpuset_sb = NULL; +static struct super_block *cpuset_sb; /* * We have two global cpuset semaphores below. They can nest. @@ -227,6 +249,11 @@ static struct super_block *cpuset_sb = NULL; * a tasks cpuset pointer we use task_lock(), which acts on a spinlock * (task->alloc_lock) already in the task_struct routinely used for * such matters. + * + * P.S. One more locking exception. RCU is used to guard the + * update of a tasks cpuset pointer by attach_task() and the + * access of task->cpuset->mems_generation via that pointer in + * the routine cpuset_update_task_memory_state(). */ static DECLARE_MUTEX(manage_sem); @@ -304,7 +331,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry) spin_lock(&dcache_lock); node = dentry->d_subdirs.next; while (node != &dentry->d_subdirs) { - struct dentry *d = list_entry(node, struct dentry, d_child); + struct dentry *d = list_entry(node, struct dentry, d_u.d_child); list_del_init(node); if (d->d_inode) { d = dget_locked(d); @@ -316,7 +343,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry) } node = dentry->d_subdirs.next; } - list_del_init(&dentry->d_child); + list_del_init(&dentry->d_u.d_child); spin_unlock(&dcache_lock); remove_dir(dentry); } @@ -570,20 +597,43 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) BUG_ON(!nodes_intersects(*pmask, node_online_map)); } -/* - * Refresh current tasks mems_allowed and mems_generation from current - * tasks cpuset. +/** + * cpuset_update_task_memory_state - update task memory placement + * + * If the current tasks cpusets mems_allowed changed behind our + * backs, update current->mems_allowed, mems_generation and task NUMA + * mempolicy to the new value. * - * Call without callback_sem or task_lock() held. May be called with - * or without manage_sem held. Will acquire task_lock() and might - * acquire callback_sem during call. + * Task mempolicy is updated by rebinding it relative to the + * current->cpuset if a task has its memory placement changed. + * Do not call this routine if in_interrupt(). * - * The task_lock() is required to dereference current->cpuset safely. - * Without it, we could pick up the pointer value of current->cpuset - * in one instruction, and then attach_task could give us a different - * cpuset, and then the cpuset we had could be removed and freed, - * and then on our next instruction, we could dereference a no longer - * valid cpuset pointer to get its mems_generation field. + * Call without callback_sem or task_lock() held. May be called + * with or without manage_sem held. Doesn't need task_lock to guard + * against another task changing a non-NULL cpuset pointer to NULL, + * as that is only done by a task on itself, and if the current task + * is here, it is not simultaneously in the exit code NULL'ing its + * cpuset pointer. This routine also might acquire callback_sem and + * current->mm->mmap_sem during call. + * + * Reading current->cpuset->mems_generation doesn't need task_lock + * to guard the current->cpuset derefence, because it is guarded + * from concurrent freeing of current->cpuset by attach_task(), + * using RCU. + * + * The rcu_dereference() is technically probably not needed, + * as I don't actually mind if I see a new cpuset pointer but + * an old value of mems_generation. However this really only + * matters on alpha systems using cpusets heavily. If I dropped + * that rcu_dereference(), it would save them a memory barrier. + * For all other arch's, rcu_dereference is a no-op anyway, and for + * alpha systems not using cpusets, another planned optimization, + * avoiding the rcu critical section for tasks in the root cpuset + * which is statically allocated, so can't vanish, will make this + * irrelevant. Better to use RCU as intended, than to engage in + * some cute trick to save a memory barrier that is impossible to + * test, for alpha systems using cpusets heavily, which might not + * even exist. * * This routine is needed to update the per-task mems_allowed data, * within the tasks context, when it is trying to allocate memory @@ -591,27 +641,31 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) * task has been modifying its cpuset. */ -static void refresh_mems(void) +void cpuset_update_task_memory_state() { int my_cpusets_mem_gen; + struct task_struct *tsk = current; + struct cpuset *cs; - task_lock(current); - my_cpusets_mem_gen = current->cpuset->mems_generation; - task_unlock(current); - - if (current->cpuset_mems_generation != my_cpusets_mem_gen) { - struct cpuset *cs; - nodemask_t oldmem = current->mems_allowed; + if (tsk->cpuset == &top_cpuset) { + /* Don't need rcu for top_cpuset. It's never freed. */ + my_cpusets_mem_gen = top_cpuset.mems_generation; + } else { + rcu_read_lock(); + cs = rcu_dereference(tsk->cpuset); + my_cpusets_mem_gen = cs->mems_generation; + rcu_read_unlock(); + } + if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { down(&callback_sem); - task_lock(current); - cs = current->cpuset; - guarantee_online_mems(cs, ¤t->mems_allowed); - current->cpuset_mems_generation = cs->mems_generation; - task_unlock(current); + task_lock(tsk); + cs = tsk->cpuset; /* Maybe changed when task not locked */ + guarantee_online_mems(cs, &tsk->mems_allowed); + tsk->cpuset_mems_generation = cs->mems_generation; + task_unlock(tsk); up(&callback_sem); - if (!nodes_equal(oldmem, current->mems_allowed)) - numa_policy_rebind(&oldmem, ¤t->mems_allowed); + mpol_rebind_task(tsk, &tsk->mems_allowed); } } @@ -766,36 +820,150 @@ static int update_cpumask(struct cpuset *cs, char *buf) } /* + * Handle user request to change the 'mems' memory placement + * of a cpuset. Needs to validate the request, update the + * cpusets mems_allowed and mems_generation, and for each + * task in the cpuset, rebind any vma mempolicies and if + * the cpuset is marked 'memory_migrate', migrate the tasks + * pages to the new memory. + * * Call with manage_sem held. May take callback_sem during call. + * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, + * lock each such tasks mm->mmap_sem, scan its vma's and rebind + * their mempolicies to the cpusets new mems_allowed. */ static int update_nodemask(struct cpuset *cs, char *buf) { struct cpuset trialcs; + nodemask_t oldmem; + struct task_struct *g, *p; + struct mm_struct **mmarray; + int i, n, ntasks; + int migrate; + int fudge; int retval; trialcs = *cs; retval = nodelist_parse(buf, trialcs.mems_allowed); if (retval < 0) - return retval; + goto done; nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map); - if (nodes_empty(trialcs.mems_allowed)) - return -ENOSPC; + oldmem = cs->mems_allowed; + if (nodes_equal(oldmem, trialcs.mems_allowed)) { + retval = 0; /* Too easy - nothing to do */ + goto done; + } + if (nodes_empty(trialcs.mems_allowed)) { + retval = -ENOSPC; + goto done; + } retval = validate_change(cs, &trialcs); - if (retval == 0) { - down(&callback_sem); - cs->mems_allowed = trialcs.mems_allowed; - atomic_inc(&cpuset_mems_generation); - cs->mems_generation = atomic_read(&cpuset_mems_generation); - up(&callback_sem); + if (retval < 0) + goto done; + + down(&callback_sem); + cs->mems_allowed = trialcs.mems_allowed; + atomic_inc(&cpuset_mems_generation); + cs->mems_generation = atomic_read(&cpuset_mems_generation); + up(&callback_sem); + + set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */ + + fudge = 10; /* spare mmarray[] slots */ + fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ + retval = -ENOMEM; + + /* + * Allocate mmarray[] to hold mm reference for each task + * in cpuset cs. Can't kmalloc GFP_KERNEL while holding + * tasklist_lock. We could use GFP_ATOMIC, but with a + * few more lines of code, we can retry until we get a big + * enough mmarray[] w/o using GFP_ATOMIC. + */ + while (1) { + ntasks = atomic_read(&cs->count); /* guess */ + ntasks += fudge; + mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL); + if (!mmarray) + goto done; + write_lock_irq(&tasklist_lock); /* block fork */ + if (atomic_read(&cs->count) <= ntasks) + break; /* got enough */ + write_unlock_irq(&tasklist_lock); /* try again */ + kfree(mmarray); } + + n = 0; + + /* Load up mmarray[] with mm reference for each task in cpuset. */ + do_each_thread(g, p) { + struct mm_struct *mm; + + if (n >= ntasks) { + printk(KERN_WARNING + "Cpuset mempolicy rebind incomplete.\n"); + continue; + } + if (p->cpuset != cs) + continue; + mm = get_task_mm(p); + if (!mm) + continue; + mmarray[n++] = mm; + } while_each_thread(g, p); + write_unlock_irq(&tasklist_lock); + + /* + * Now that we've dropped the tasklist spinlock, we can + * rebind the vma mempolicies of each mm in mmarray[] to their + * new cpuset, and release that mm. The mpol_rebind_mm() + * call takes mmap_sem, which we couldn't take while holding + * tasklist_lock. Forks can happen again now - the mpol_copy() + * cpuset_being_rebound check will catch such forks, and rebind + * their vma mempolicies too. Because we still hold the global + * cpuset manage_sem, we know that no other rebind effort will + * be contending for the global variable cpuset_being_rebound. + * It's ok if we rebind the same mm twice; mpol_rebind_mm() + * is idempotent. Also migrate pages in each mm to new nodes. + */ + migrate = is_memory_migrate(cs); + for (i = 0; i < n; i++) { + struct mm_struct *mm = mmarray[i]; + + mpol_rebind_mm(mm, &cs->mems_allowed); + if (migrate) { + do_migrate_pages(mm, &oldmem, &cs->mems_allowed, + MPOL_MF_MOVE_ALL); + } + mmput(mm); + } + + /* We're done rebinding vma's to this cpusets new mems_allowed. */ + kfree(mmarray); + set_cpuset_being_rebound(NULL); + retval = 0; +done: return retval; } /* + * Call with manage_sem held. + */ + +static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) +{ + if (simple_strtoul(buf, NULL, 10) != 0) + cpuset_memory_pressure_enabled = 1; + else + cpuset_memory_pressure_enabled = 0; + return 0; +} + +/* * update_flag - read a 0 or a 1 in a file and update associated flag * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, - * CS_NOTIFY_ON_RELEASE) + * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE) * cs: the cpuset to update * buf: the buffer where we read the 0 or 1 * @@ -834,6 +1002,104 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) } /* + * Frequency meter - How fast is some event occuring? + * + * These routines manage a digitally filtered, constant time based, + * event frequency meter. There are four routines: + * fmeter_init() - initialize a frequency meter. + * fmeter_markevent() - called each time the event happens. + * fmeter_getrate() - returns the recent rate of such events. + * fmeter_update() - internal routine used to update fmeter. + * + * A common data structure is passed to each of these routines, + * which is used to keep track of the state required to manage the + * frequency meter and its digital filter. + * + * The filter works on the number of events marked per unit time. + * The filter is single-pole low-pass recursive (IIR). The time unit + * is 1 second. Arithmetic is done using 32-bit integers scaled to + * simulate 3 decimal digits of precision (multiplied by 1000). + * + * With an FM_COEF of 933, and a time base of 1 second, the filter + * has a half-life of 10 seconds, meaning that if the events quit + * happening, then the rate returned from the fmeter_getrate() + * will be cut in half each 10 seconds, until it converges to zero. + * + * It is not worth doing a real infinitely recursive filter. If more + * than FM_MAXTICKS ticks have elapsed since the last filter event, + * just compute FM_MAXTICKS ticks worth, by which point the level + * will be stable. + * + * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid + * arithmetic overflow in the fmeter_update() routine. + * + * Given the simple 32 bit integer arithmetic used, this meter works + * best for reporting rates between one per millisecond (msec) and + * one per 32 (approx) seconds. At constant rates faster than one |