diff options
Diffstat (limited to 'kernel')
53 files changed, 822 insertions, 333 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 170a9213c1b..e4791b3ba55 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -51,6 +51,7 @@ obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_PM) += power/ +obj-$(CONFIG_FREEZER) += power/ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o diff --git a/kernel/async.c b/kernel/async.c index 608b32b4281..f565891f2c9 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -54,6 +54,7 @@ asynchronous and synchronous parts of the kernel. #include <linux/sched.h> #include <linux/init.h> #include <linux/kthread.h> +#include <linux/delay.h> #include <asm/atomic.h> static async_cookie_t next_cookie = 1; @@ -132,21 +133,23 @@ static void run_one_entry(void) entry = list_first_entry(&async_pending, struct async_entry, list); /* 2) move it to the running queue */ - list_del(&entry->list); - list_add_tail(&entry->list, &async_running); + list_move_tail(&entry->list, entry->running); spin_unlock_irqrestore(&async_lock, flags); /* 3) run it (and print duration)*/ if (initcall_debug && system_state == SYSTEM_BOOTING) { - printk("calling %lli_%pF @ %i\n", entry->cookie, entry->func, task_pid_nr(current)); + printk("calling %lli_%pF @ %i\n", (long long)entry->cookie, + entry->func, task_pid_nr(current)); calltime = ktime_get(); } entry->func(entry->data, entry->cookie); if (initcall_debug && system_state == SYSTEM_BOOTING) { rettime = ktime_get(); delta = ktime_sub(rettime, calltime); - printk("initcall %lli_%pF returned 0 after %lld usecs\n", entry->cookie, - entry->func, ktime_to_ns(delta) >> 10); + printk("initcall %lli_%pF returned 0 after %lld usecs\n", + (long long)entry->cookie, + entry->func, + (long long)ktime_to_ns(delta) >> 10); } /* 4) remove it from the running queue */ @@ -205,18 +208,44 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l return newcookie; } +/** + * async_schedule - schedule a function for asynchronous execution + * @ptr: function to execute asynchronously + * @data: data pointer to pass to the function + * + * Returns an async_cookie_t that may be used for checkpointing later. + * Note: This function may be called from atomic or non-atomic contexts. + */ async_cookie_t async_schedule(async_func_ptr *ptr, void *data) { - return __async_schedule(ptr, data, &async_pending); + return __async_schedule(ptr, data, &async_running); } EXPORT_SYMBOL_GPL(async_schedule); -async_cookie_t async_schedule_special(async_func_ptr *ptr, void *data, struct list_head *running) +/** + * async_schedule_domain - schedule a function for asynchronous execution within a certain domain + * @ptr: function to execute asynchronously + * @data: data pointer to pass to the function + * @running: running list for the domain + * + * Returns an async_cookie_t that may be used for checkpointing later. + * @running may be used in the async_synchronize_*_domain() functions + * to wait within a certain synchronization domain rather than globally. + * A synchronization domain is specified via the running queue @running to use. + * Note: This function may be called from atomic or non-atomic contexts. + */ +async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, + struct list_head *running) { return __async_schedule(ptr, data, running); } -EXPORT_SYMBOL_GPL(async_schedule_special); +EXPORT_SYMBOL_GPL(async_schedule_domain); +/** + * async_synchronize_full - synchronize all asynchronous function calls + * + * This function waits until all asynchronous function calls have been done. + */ void async_synchronize_full(void) { do { @@ -225,13 +254,30 @@ void async_synchronize_full(void) } EXPORT_SYMBOL_GPL(async_synchronize_full); -void async_synchronize_full_special(struct list_head *list) +/** + * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain + * @list: running list to synchronize on + * + * This function waits until all asynchronous function calls for the + * synchronization domain specified by the running list @list have been done. + */ +void async_synchronize_full_domain(struct list_head *list) { - async_synchronize_cookie_special(next_cookie, list); + async_synchronize_cookie_domain(next_cookie, list); } -EXPORT_SYMBOL_GPL(async_synchronize_full_special); +EXPORT_SYMBOL_GPL(async_synchronize_full_domain); -void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *running) +/** + * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing + * @cookie: async_cookie_t to use as checkpoint + * @running: running list to synchronize on + * + * This function waits until all asynchronous function calls for the + * synchronization domain specified by the running list @list submitted + * prior to @cookie have been done. + */ +void async_synchronize_cookie_domain(async_cookie_t cookie, + struct list_head *running) { ktime_t starttime, delta, endtime; @@ -247,14 +293,22 @@ void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *r delta = ktime_sub(endtime, starttime); printk("async_continuing @ %i after %lli usec\n", - task_pid_nr(current), ktime_to_ns(delta) >> 10); + task_pid_nr(current), + (long long)ktime_to_ns(delta) >> 10); } } -EXPORT_SYMBOL_GPL(async_synchronize_cookie_special); +EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain); +/** + * async_synchronize_cookie - synchronize asynchronous function calls with cookie checkpointing + * @cookie: async_cookie_t to use as checkpoint + * + * This function waits until all asynchronous function calls prior to @cookie + * have been done. + */ void async_synchronize_cookie(async_cookie_t cookie) { - async_synchronize_cookie_special(cookie, &async_running); + async_synchronize_cookie_domain(cookie, &async_running); } EXPORT_SYMBOL_GPL(async_synchronize_cookie); @@ -315,7 +369,11 @@ static int async_manager_thread(void *unused) ec = atomic_read(&entry_count); while (tc < ec && tc < MAX_THREADS) { - kthread_run(async_thread, NULL, "async/%i", tc); + if (IS_ERR(kthread_run(async_thread, NULL, "async/%i", + tc))) { + msleep(100); + continue; + } atomic_inc(&thread_count); tc++; } @@ -330,7 +388,9 @@ static int async_manager_thread(void *unused) static int __init async_init(void) { if (async_enabled) - kthread_run(async_manager_thread, NULL, "async/mgr"); + if (IS_ERR(kthread_run(async_manager_thread, NULL, + "async/mgr"))) + async_enabled = 0; return 0; } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c29831076e7..9edb5c4b79b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1115,13 +1115,15 @@ static void cgroup_kill_sb(struct super_block *sb) { } write_unlock(&css_set_lock); - list_del(&root->root_list); - root_count--; + if (!list_empty(&root->root_list)) { + list_del(&root->root_list); + root_count--; + } mutex_unlock(&cgroup_mutex); - kfree(root); kill_litter_super(sb); + kfree(root); } static struct file_system_type cgroup_fs_type = { @@ -2349,7 +2351,7 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root) for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (ss->root == root) - mutex_lock_nested(&ss->hierarchy_mutex, i); + mutex_lock(&ss->hierarchy_mutex); } } @@ -2434,7 +2436,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err_remove: + cgroup_lock_hierarchy(root); list_del(&cgrp->sibling); + cgroup_unlock_hierarchy(root); root->number_of_cgroups--; err_destroy: @@ -2507,7 +2511,7 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp) for_each_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; int refcnt; - do { + while (1) { /* We can only remove a CSS with a refcnt==1 */ refcnt = atomic_read(&css->refcnt); if (refcnt > 1) { @@ -2521,7 +2525,10 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp) * css_tryget() to spin until we set the * CSS_REMOVED bits or abort */ - } while (atomic_cmpxchg(&css->refcnt, refcnt, 0) != refcnt); + if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt) + break; + cpu_relax(); + } } done: for_each_subsys(cgrp->root, ss) { @@ -2630,6 +2637,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(!list_empty(&init_task.tasks)); mutex_init(&ss->hierarchy_mutex); + lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); ss->active = 1; } @@ -2991,20 +2999,21 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, mutex_unlock(&cgroup_mutex); return 0; } - task_lock(tsk); - cg = tsk->cgroups; - parent = task_cgroup(tsk, subsys->subsys_id); /* Pin the hierarchy */ - if (!atomic_inc_not_zero(&parent->root->sb->s_active)) { + if (!atomic_inc_not_zero(&root->sb->s_active)) { /* We race with the final deactivate_super() */ mutex_unlock(&cgroup_mutex); return 0; } /* Keep the cgroup alive */ + task_lock(tsk); + parent = task_cgroup(tsk, subsys->subsys_id); + cg = tsk->cgroups; get_css_set(cg); task_unlock(tsk); + mutex_unlock(&cgroup_mutex); /* Now do the VFS work to create a cgroup */ @@ -3043,7 +3052,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, mutex_unlock(&inode->i_mutex); put_css_set(cg); - deactivate_super(parent->root->sb); + deactivate_super(root->sb); /* The cgroup is still accessible in the VFS, but * we're not going to try to rmdir() it at this * point. */ @@ -3069,7 +3078,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, mutex_lock(&cgroup_mutex); put_css_set(cg); mutex_unlock(&cgroup_mutex); - deactivate_super(parent->root->sb); + deactivate_super(root->sb); return ret; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index a85678865c5..f76db9dcaa0 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -61,6 +61,14 @@ #include <linux/cgroup.h> /* + * Workqueue for cpuset related tasks. + * + * Using kevent workqueue may cause deadlock when memory_migrate + * is set. So we create a separate workqueue thread for cpuset. + */ +static struct workqueue_struct *cpuset_wq; + +/* * Tracks how many cpusets are currently defined in system. * When there is only one cpuset (the root cpuset) we can * short circuit some hooks. @@ -831,7 +839,7 @@ static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); */ static void async_rebuild_sched_domains(void) { - schedule_work(&rebuild_sched_domains_work); + queue_work(cpuset_wq, &rebuild_sched_domains_work); } /* @@ -2111,6 +2119,9 @@ void __init cpuset_init_smp(void) hotcpu_notifier(cpuset_track_online_cpus, 0); hotplug_memory_notifier(cpuset_track_online_nodes, 10); + + cpuset_wq = create_singlethread_workqueue("cpuset"); + BUG_ON(!cpuset_wq); } /** diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c index 038707404b7..962a3b574f2 100644 --- a/kernel/dma-coherent.c +++ b/kernel/dma-coherent.c @@ -98,7 +98,7 @@ EXPORT_SYMBOL(dma_mark_declared_memory_occupied); * @size: size of requested memory area * @dma_handle: This will be filled with the correct dma handle * @ret: This pointer will be filled with the virtual address - * to allocated area. + * to allocated area. * * This function should be only called from per-arch dma_alloc_coherent() * to support allocation from per-device coherent memory pools. @@ -118,31 +118,32 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size, mem = dev->dma_mem; if (!mem) return 0; - if (unlikely(size > mem->size)) - return 0; + + *ret = NULL; + + if (unlikely(size > (mem->size << PAGE_SHIFT))) + goto err; pageno = bitmap_find_free_region(mem->bitmap, mem->size, order); - if (pageno >= 0) { - /* - * Memory was found in the per-device arena. - */ - *dma_handle = mem->device_base + (pageno << PAGE_SHIFT); - *ret = mem->virt_base + (pageno << PAGE_SHIFT); - memset(*ret, 0, size); - } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) { - /* - * The per-device arena is exhausted and we are not - * permitted to fall back to generic memory. - */ - *ret = NULL; - } else { - /* - * The per-device arena is exhausted and we are - * permitted to fall back to generic memory. - */ - return 0; - } + if (unlikely(pageno < 0)) + goto err; + + /* + * Memory was found in the per-device area. + */ + *dma_handle = mem->device_base + (pageno << PAGE_SHIFT); + *ret = mem->virt_base + (pageno << PAGE_SHIFT); + memset(*ret, 0, size); + return 1; + +err: + /* + * In the case where the allocation can not be satisfied from the + * per-device area, try to fall back to generic memory if the + * constraints allow it. + */ + return mem->flags & DMA_MEMORY_EXCLUSIVE; } EXPORT_SYMBOL(dma_alloc_from_coherent); diff --git a/kernel/exit.c b/kernel/exit.c index f80dec3f187..efd30ccf385 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -118,6 +118,8 @@ static void __exit_signal(struct task_struct *tsk) * We won't ever get here for the group leader, since it * will have been the last reference on the signal_struct. */ + sig->utime = cputime_add(sig->utime, task_utime(tsk)); + sig->stime = cputime_add(sig->stime, task_stime(tsk)); sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; @@ -126,6 +128,7 @@ static void __exit_signal(struct task_struct *tsk) sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); task_io_accounting_add(&sig->ioac, &tsk->ioac); + sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig = NULL; /* Marker for below. */ } diff --git a/kernel/fork.c b/kernel/fork.c index bf0cef8bbdf..a66fbde2071 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -817,17 +817,17 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) { struct signal_struct *sig; - int ret; if (clone_flags & CLONE_THREAD) { - ret = thread_group_cputime_clone_thread(current); - if (likely(!ret)) { - atomic_inc(¤t->signal->count); - atomic_inc(¤t->signal->live); - } - return ret; + atomic_inc(¤t->signal->count); + atomic_inc(¤t->signal->live); + return 0; } sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); + + if (sig) + posix_cpu_timers_init_group(sig); + tsk->signal = sig; if (!sig) return -ENOMEM; @@ -851,21 +851,20 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->tty_old_pgrp = NULL; sig->tty = NULL; - sig->cutime = sig->cstime = cputime_zero; + sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; sig->gtime = cputime_zero; sig->cgtime = cputime_zero; sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; task_io_accounting_init(&sig->ioac); + sig->sum_sched_runtime = 0; taskstats_tgid_init(sig); task_lock(current->group_leader); memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); task_unlock(current->group_leader); - posix_cpu_timers_init_group(sig); - acct_init_pacct(&sig->pacct); tty_audit_fork(sig); @@ -1007,6 +1006,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, * triggers too late. This doesn't hurt, the check is only there * to stop root fork bombs. */ + retval = -EAGAIN; if (nr_threads >= max_threads) goto bad_fork_cleanup_count; @@ -1095,7 +1095,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif - if (unlikely(ptrace_reparented(current))) + if (unlikely(current->ptrace)) ptrace_fork(p, clone_flags); /* Perform scheduler related setup. Assign this task to a CPU. */ diff --git a/kernel/futex.c b/kernel/futex.c index f89d373a9c6..438701adce2 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1165,6 +1165,7 @@ static int futex_wait(u32 __user *uaddr, int fshared, u32 val, ktime_t *abs_time, u32 bitset, int clockrt) { struct task_struct *curr = current; + struct restart_block *restart; DECLARE_WAITQUEUE(wait, curr); struct futex_hash_bucket *hb; struct futex_q q; @@ -1216,11 +1217,13 @@ retry: if (!ret) goto retry; - return ret; + goto out; } ret = -EWOULDBLOCK; - if (uval != val) - goto out_unlock_put_key; + if (unlikely(uval != val)) { + queue_unlock(&q, hb); + goto out_put_key; + } /* Only actually queue if *uaddr contained val. */ queue_me(&q, hb); @@ -1284,38 +1287,38 @@ retry: */ /* If we were woken (and unqueued), we succeeded, whatever. */ + ret = 0; if (!unqueue_me(&q)) - return 0; + goto out_put_key; + ret = -ETIMEDOUT; if (rem) - return -ETIMEDOUT; + goto out_put_key; /* * We expect signal_pending(current), but another thread may * have handled it for us already. */ + ret = -ERESTARTSYS; if (!abs_time) - return -ERESTARTSYS; - else { - struct restart_block *restart; - restart = ¤t_thread_info()->restart_block; - restart->fn = futex_wait_restart; - restart->futex.uaddr = (u32 *)uaddr; - restart->futex.val = val; - restart->futex.time = abs_time->tv64; - restart->futex.bitset = bitset; - restart->futex.flags = 0; - - if (fshared) - restart->futex.flags |= FLAGS_SHARED; - if (clockrt) - restart->futex.flags |= FLAGS_CLOCKRT; - return -ERESTART_RESTARTBLOCK; - } + goto out_put_key; -out_unlock_put_key: - queue_unlock(&q, hb); - put_futex_key(fshared, &q.key); + restart = ¤t_thread_info()->restart_block; + restart->fn = futex_wait_restart; + restart->futex.uaddr = (u32 *)uaddr; + restart->futex.val = val; + restart->futex.time = abs_time->tv64; + restart->futex.bitset = bitset; + restart->futex.flags = 0; + + if (fshared) + restart->futex.flags |= FLAGS_SHARED; + if (clockrt) + restart->futex.flags |= FLAGS_CLOCKRT; + ret = -ERESTART_RESTARTBLOCK; + +out_put_key: + put_futex_key(fshared, &q.key); out: return ret; } diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 2dc30c59c5f..f394d2a42ca 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -501,6 +501,13 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base) continue; timer = rb_entry(base->first, struct hrtimer, node); expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + /* + * clock_was_set() has changed base->offset so the + * result might be negative. Fix it up to prevent a + * false positive in clockevents_program_event() + */ + if (expires.tv64 < 0) + expires.tv64 = 0; if (expires.tv64 < cpu_base->expires_next.tv64) cpu_base->expires_next = expires; } @@ -614,7 +621,9 @@ void clock_was_set(void) */ void hres_timers_resume(void) { - /* Retrigger the CPU local events: */ + WARN_ONCE(!irqs_disabled(), + KERN_INFO "hres_timers_resume() called with IRQs enabled!"); + retrigger_next_event(NULL); } @@ -1156,6 +1165,29 @@ static void __run_hrtimer(struct hrtimer *timer) #ifdef CONFIG_HIGH_RES_TIMERS +static int force_clock_reprogram; + +/* + * After 5 iteration's attempts, we consider that hrtimer_interrupt() + * is hanging, which could happen with something that slows the interrupt + * such as the tracing. Then we force the clock reprogramming for each future + * hrtimer interrupts to avoid infinite loops and use the min_delta_ns + * threshold that we will overwrite. + * The next tick event will be scheduled to 3 times we currently spend on + * hrtimer_interrupt(). This gives a good compromise, the cpus will spend + * 1/4 of their time to process the hrtimer interrupts. This is enough to + * let it running without serious starvation. + */ + +static inline void +hrtimer_interrupt_hanging(struct clock_event_device *dev, + ktime_t try_time) +{ + force_clock_reprogram = 1; + dev->min_delta_ns = (unsigned long)try_time.tv64 * 3; + printk(KERN_WARNING "hrtimer: interrupt too slow, " + "forcing clock min delta to %lu ns\n", dev->min_delta_ns); +} /* * High resolution timer interrupt * Called with interrupts disabled @@ -1165,6 +1197,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); struct hrtimer_clock_base *base; ktime_t expires_next, now; + int nr_retries = 0; int i; BUG_ON(!cpu_base->hres_active); @@ -1172,6 +1205,10 @@ void hrtimer_interrupt(struct clock_event_device *dev) dev->next_event.tv64 = KTIME_MAX; retry: + /* 5 retries is enough to notice a hang */ + if (!(++nr_retries % 5)) + hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now)); + now = ktime_get(); expires_next.tv64 = KTIME_MAX; @@ -1224,7 +1261,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) /* Reprogramming necessary ? */ if (expires_next.tv64 != KTIME_MAX) { - if (tick_program_event(expires_next, 0)) + if (tick_program_event(expires_next, force_clock_reprogram)) goto retry; } } @@ -1578,6 +1615,10 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, break; #ifdef CONFIG_HOTPLUG_CPU + case CPU_DYING: + case CPU_DYING_FROZEN: + clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu); + break; case CPU_DEAD: case CPU_DEAD_FROZEN: { diff --git a/kernel/irq/c |