diff options
Diffstat (limited to 'kernel')
59 files changed, 1763 insertions, 857 deletions
diff --git a/kernel/acct.c b/kernel/acct.c index dd68b905941..f6006a60df5 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -548,7 +548,7 @@ static void do_acct_process(struct bsd_acct_struct *acct, #endif spin_lock_irq(¤t->sighand->siglock); - tty = current->signal->tty; + tty = current->signal->tty; /* Safe as we hold the siglock */ ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 59cedfb040e..cf5bc2f5f9c 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -246,8 +246,8 @@ static int audit_match_perm(struct audit_context *ctx, int mask) unsigned n; if (unlikely(!ctx)) return 0; - n = ctx->major; + switch (audit_classify_syscall(ctx->arch, n)) { case 0: /* native */ if ((mask & AUDIT_PERM_WRITE) && @@ -1204,13 +1204,13 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", context->return_code); - mutex_lock(&tty_mutex); - read_lock(&tasklist_lock); + spin_lock_irq(&tsk->sighand->siglock); if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) tty = tsk->signal->tty->name; else tty = "(none)"; - read_unlock(&tasklist_lock); + spin_unlock_irq(&tsk->sighand->siglock); + audit_log_format(ab, " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" " ppid=%d pid=%d auid=%u uid=%u gid=%u" @@ -1230,7 +1230,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->egid, context->sgid, context->fsgid, tty, tsk->sessionid); - mutex_unlock(&tty_mutex); audit_log_task_info(ab, tsk); if (context->filterkey) { diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 13932abde15..8c6e1c17e6d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2735,21 +2735,24 @@ void cgroup_fork_callbacks(struct task_struct *child) * Called on every change to mm->owner. mm_init_owner() does not * invoke this routine, since it assigns the mm->owner the first time * and does not change it. + * + * The callbacks are invoked with mmap_sem held in read mode. */ void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) { - struct cgroup *oldcgrp, *newcgrp; + struct cgroup *oldcgrp, *newcgrp = NULL; if (need_mm_owner_callback) { int i; for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; oldcgrp = task_cgroup(old, ss->subsys_id); - newcgrp = task_cgroup(new, ss->subsys_id); + if (new) + newcgrp = task_cgroup(new, ss->subsys_id); if (oldcgrp == newcgrp) continue; if (ss->mm_owner_changed) - ss->mm_owner_changed(ss, oldcgrp, newcgrp); + ss->mm_owner_changed(ss, oldcgrp, newcgrp, new); } } } diff --git a/kernel/compat.c b/kernel/compat.c index 72650e39b3e..8eafe3eb50d 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -27,6 +27,64 @@ #include <asm/uaccess.h> +/* + * Note that the native side is already converted to a timespec, because + * that's what we want anyway. + */ +static int compat_get_timeval(struct timespec *o, + struct compat_timeval __user *i) +{ + long usec; + + if (get_user(o->tv_sec, &i->tv_sec) || + get_user(usec, &i->tv_usec)) + return -EFAULT; + o->tv_nsec = usec * 1000; + return 0; +} + +static int compat_put_timeval(struct compat_timeval __user *o, + struct timeval *i) +{ + return (put_user(i->tv_sec, &o->tv_sec) || + put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; +} + +asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, + struct timezone __user *tz) +{ + if (tv) { + struct timeval ktv; + do_gettimeofday(&ktv); + if (compat_put_timeval(tv, &ktv)) + return -EFAULT; + } + if (tz) { + if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) + return -EFAULT; + } + + return 0; +} + +asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, + struct timezone __user *tz) +{ + struct timespec kts; + struct timezone ktz; + + if (tv) { + if (compat_get_timeval(&kts, tv)) + return -EFAULT; + } + if (tz) { + if (copy_from_user(&ktz, tz, sizeof(ktz))) + return -EFAULT; + } + + return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); +} + int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) { return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || diff --git a/kernel/cpu.c b/kernel/cpu.c index f17e9854c24..86d49045dae 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -199,13 +199,14 @@ static int __ref take_cpu_down(void *_param) struct take_cpu_down_param *param = _param; int err; - raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, - param->hcpu); /* Ensure this CPU doesn't handle any more interrupts. */ err = __cpu_disable(); if (err < 0) return err; + raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, + param->hcpu); + /* Force idle task to run as soon as we yield: it should immediately notice cpu is offline and die quickly. */ sched_idle_next(); @@ -453,6 +454,25 @@ out: } #endif /* CONFIG_PM_SLEEP_SMP */ +/** + * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers + * @cpu: cpu that just started + * + * This function calls the cpu_chain notifiers with CPU_STARTING. + * It must be called by the arch code on the new cpu, before the new cpu + * enables interrupts and before the "boot" cpu returns from __cpu_up(). + */ +void notify_cpu_starting(unsigned int cpu) +{ + unsigned long val = CPU_STARTING; + +#ifdef CONFIG_PM_SLEEP_SMP + if (cpu_isset(cpu, frozen_cpus)) + val = CPU_STARTING_FROZEN; +#endif /* CONFIG_PM_SLEEP_SMP */ + raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu); +} + #endif /* CONFIG_SMP */ /* diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 827cd9adccb..eab7bd6628e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1921,7 +1921,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) * that has tasks along with an empty 'mems'. But if we did see such * a cpuset, we'd handle it just like we do if its 'cpus' was empty. */ -static void scan_for_empty_cpusets(const struct cpuset *root) +static void scan_for_empty_cpusets(struct cpuset *root) { LIST_HEAD(queue); struct cpuset *cp; /* scans cpusets being updated */ diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c index c1d4d5b4c61..f013a0c2e11 100644 --- a/kernel/dma-coherent.c +++ b/kernel/dma-coherent.c @@ -124,6 +124,7 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size, } return (mem != NULL); } +EXPORT_SYMBOL(dma_alloc_from_coherent); /** * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool @@ -151,3 +152,4 @@ int dma_release_from_coherent(struct device *dev, int order, void *vaddr) } return 0; } +EXPORT_SYMBOL(dma_release_from_coherent); diff --git a/kernel/dma.c b/kernel/dma.c index d2c60a82279..f903189c530 100644 --- a/kernel/dma.c +++ b/kernel/dma.c @@ -1,4 +1,4 @@ -/* $Id: dma.c,v 1.7 1994/12/28 03:35:33 root Exp root $ +/* * linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c. * * Written by Hennus Bergman, 1992. diff --git a/kernel/exit.c b/kernel/exit.c index 40036ac0427..059b38cae38 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -580,8 +580,6 @@ mm_need_new_owner(struct mm_struct *mm, struct task_struct *p) * If there are other users of the mm and the owner (us) is exiting * we need to find a new owner to take on the responsibility. */ - if (!mm) - return 0; if (atomic_read(&mm->mm_users) <= 1) return 0; if (mm->owner != p) @@ -624,29 +622,38 @@ retry: } while_each_thread(g, c); read_unlock(&tasklist_lock); + /* + * We found no owner yet mm_users > 1: this implies that we are + * most likely racing with swapoff (try_to_unuse()) or /proc or + * ptrace or page migration (get_task_mm()). Mark owner as NULL, + * so that subsystems can understand the callback and take action. + */ + down_write(&mm->mmap_sem); + cgroup_mm_owner_callbacks(mm->owner, NULL); + mm->owner = NULL; + up_write(&mm->mmap_sem); return; assign_new_owner: BUG_ON(c == p); get_task_struct(c); + read_unlock(&tasklist_lock); + down_write(&mm->mmap_sem); /* * The task_lock protects c->mm from changing. * We always want mm->owner->mm == mm */ task_lock(c); - /* - * Delay read_unlock() till we have the task_lock() - * to ensure that c does not slip away underneath us - */ - read_unlock(&tasklist_lock); if (c->mm != mm) { task_unlock(c); + up_write(&mm->mmap_sem); put_task_struct(c); goto retry; } cgroup_mm_owner_callbacks(mm->owner, c); mm->owner = c; task_unlock(c); + up_write(&mm->mmap_sem); put_task_struct(c); } #endif /* CONFIG_MM_OWNER */ diff --git a/kernel/fork.c b/kernel/fork.c index 021ae012cc7..44e64d7ba29 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -826,6 +826,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->leader = 0; /* session leadership doesn't inherit */ sig->tty_old_pgrp = NULL; + sig->tty = NULL; sig->cutime = sig->cstime = cputime_zero; sig->gtime = cputime_zero; @@ -853,6 +854,7 @@ void __cleanup_signal(struct signal_struct *sig) { thread_group_cputime_free(sig); exit_thread_group_keys(sig); + tty_kref_put(sig->tty); kmem_cache_free(signal_cachep, sig); } @@ -1235,7 +1237,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->nsproxy->pid_ns->child_reaper = p; p->signal->leader_pid = pid; - p->signal->tty = current->signal->tty; + tty_kref_put(p->signal->tty); + p->signal->tty = tty_kref_get(current->signal->tty); set_task_pgrp(p, task_pgrp_nr(current)); set_task_session(p, task_session_nr(current)); attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index b8e4dce80a7..95978f48e03 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -672,13 +672,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, */ BUG_ON(timer->function(timer) != HRTIMER_NORESTART); return 1; - case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ: + case HRTIMER_CB_IRQSAFE_PERCPU: + case HRTIMER_CB_IRQSAFE_UNLOCKED: /* * This is solely for the sched tick emulation with * dynamic tick support to ensure that we do not * restart the tick right on the edge and end up with * the tick timer in the softirq ! The calling site - * takes care of this. + * takes care of this. Also used for hrtimer sleeper ! */ debug_hrtimer_deactivate(timer); return 1; @@ -1245,7 +1246,8 @@ static void __run_hrtimer(struct hrtimer *timer) timer_stats_account_hrtimer(timer); fn = timer->function; - if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) { + if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU || + timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) { /* * Used for scheduler timers, avoid lock inversion with * rq->lock and tasklist_lock. @@ -1401,9 +1403,7 @@ void hrtimer_run_queues(void) if (!base->first) continue; - if (base->get_softirq_time) - base->softirq_time = base->get_softirq_time(); - else if (gettime) { + if (gettime) { hrtimer_get_softirq_time(cpu_base); gettime = 0; } @@ -1452,7 +1452,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) sl->timer.function = hrtimer_wakeup; sl->task = task; #ifdef CONFIG_HIGH_RES_TIMERS - sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; #endif } @@ -1591,49 +1591,123 @@ static void __cpuinit init_hrtimers_cpu(int cpu) #ifdef CONFIG_HOTPLUG_CPU -static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, - struct hrtimer_clock_base *new_base) +static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base, + struct hrtimer_clock_base *new_base, int dcpu) { struct hrtimer *timer; struct rb_node *node; + int raise = 0; while ((node = rb_first(&old_base->active))) { timer = rb_entry(node, struct hrtimer, node); BUG_ON(hrtimer_callback_running(timer)); debug_hrtimer_deactivate(timer); - __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0); + + /* + * Should not happen. Per CPU timers should be + * canceled _before_ the migration code is called + */ + if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) { + __remove_hrtimer(timer, old_base, + HRTIMER_STATE_INACTIVE, 0); + WARN(1, "hrtimer (%p %p)active but cpu %d dead\n", + timer, timer->function, dcpu); + continue; + } + + /* + * Mark it as STATE_MIGRATE not INACTIVE otherwise the + * timer could be seen as !active and just vanish away + * under us on another CPU + */ + __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); timer->base = new_base; /* * Enqueue the timer. Allow reprogramming of the event device */ enqueue_hrtimer(timer, new_base, 1); + +#ifdef CONFIG_HIGH_RES_TIMERS + /* + * Happens with high res enabled when the timer was + * already expired and the callback mode is + * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The + * enqueue code does not move them to the soft irq + * pending list for performance/latency reasons, but + * in the migration state, we need to do that + * otherwise we end up with a stale timer. + */ + if (timer->state == HRTIMER_STATE_MIGRATE) { + timer->state = HRTIMER_STATE_PENDING; + list_add_tail(&timer->cb_entry, + &new_base->cpu_base->cb_pending); + raise = 1; + } +#endif + /* Clear the migration state bit */ + timer->state &= ~HRTIMER_STATE_MIGRATE; } + return raise; } +#ifdef CONFIG_HIGH_RES_TIMERS +static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base, + struct hrtimer_cpu_base *new_base) +{ + struct hrtimer *timer; + int raise = 0; + + while (!list_empty(&old_base->cb_pending)) { + timer = list_entry(old_base->cb_pending.next, + struct hrtimer, cb_entry); + + __remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0); + timer->base = &new_base->clock_base[timer->base->index]; + list_add_tail(&timer->cb_entry, &new_base->cb_pending); + raise = 1; + } + return raise; +} +#else +static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base, + struct hrtimer_cpu_base *new_base) +{ + return 0; +} +#endif + static void migrate_hrtimers(int cpu) { struct hrtimer_cpu_base *old_base, *new_base; - int i; + int i, raise = 0; BUG_ON(cpu_online(cpu)); old_base = &per_cpu(hrtimer_bases, cpu); new_base = &get_cpu_var(hrtimer_bases); tick_cancel_sched_timer(cpu); - - local_irq_disable(); - spin_lock(&new_base->lock); + /* + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. + */ + spin_lock_irq(&new_base->lock); spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - migrate_hrtimer_list(&old_base->clock_base[i], - &new_base->clock_base[i]); + if (migrate_hrtimer_list(&old_base->clock_base[i], + &new_base->clock_base[i], cpu)) + raise = 1; } + if (migrate_hrtimer_pending(old_base, new_base)) + raise = 1; + spin_unlock(&old_base->lock); - spin_unlock(&new_base->lock); - local_irq_enable(); + spin_unlock_irq(&new_base->lock); put_cpu_var(hrtimer_bases); + + if (raise) + hrtimer_raise_softirq(); } #endif /* CONFIG_HOTPLUG_CPU */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0314074fa23..60c49e32439 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -89,7 +89,14 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) set_balance_irq_affinity(irq, cpumask); #ifdef CONFIG_GENERIC_PENDING_IRQ - set_pending_irq(irq, cpumask); + if (desc->status & IRQ_MOVE_PCNTXT) { + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + desc->chip->set_affinity(irq, cpumask); + spin_unlock_irqrestore(&desc->lock, flags); + } else + set_pending_irq(irq, cpumask); #else desc->affinity = cpumask; desc->chip->set_affinity(irq, cpumask); diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 38fc10ac754..5072cf1685a 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -260,7 +260,6 @@ const char *kallsyms_lookup(unsigned long addr, /* see if it's in a module */ return module_address_lookup(addr, symbolsize, offset, modname, namebuf); - return NULL; } int lookup_symbol_name(unsigned long addr, char *symname) diff --git a/kernel/kexec.c b/kernel/kexec.c index 59f3f0df35d..aef265325cd 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -753,8 +753,14 @@ static struct page *kimage_alloc_page(struct kimage *image, *old = addr | (*old & ~PAGE_MASK); /* The old page I have found cannot be a - * destination page, so return it. + * destination page, so return it if it's + * gfp_flags honor the ones passed in. */ + if (!(gfp_mask & __GFP_HIGHMEM) && + PageHighMem(old_page)) { + kimage_free_pages(old_page); + continue; + } addr = old_addr; page = old_page; break; diff --git a/kernel/kgdb.c b/kernel/kgdb.c index eaa21fc9ad1..e4dcfb2272a 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -488,7 +488,7 @@ static int write_mem_msg(int binary) if (err) return err; if (CACHE_FLUSH_IS_SAFE) - flush_icache_range(addr, addr + length + 1); + flush_icache_range(addr, addr + length); return 0; } @@ -590,6 +590,7 @@ static void kgdb_wait(struct pt_regs *regs) /* Signal the primary CPU that we are done: */ atomic_set(&cpu_in_kgdb[cpu], 0); + touch_softlockup_watchdog(); clocksource_touch_watchdog(); local_irq_restore(flags); } @@ -1432,6 +1433,7 @@ acquirelock: atomic_read(&kgdb_cpu_doing_single_step) != cpu) { atomic_set(&kgdb_active, -1); + touch_softlockup_watchdog(); clocksource_touch_watchdog(); local_irq_restore(flags); @@ -1462,7 +1464,7 @@ acquirelock: * Get the passive CPU lock which will hold all the non-primary * CPU in a spin state while the debugger is active */ - if (!kgdb_single_step || !kgdb_contthread) { + if (!kgdb_single_step) { for (i = 0; i < NR_CPUS; i++) atomic_set(&passive_cpu_wait[i], 1); } @@ -1475,7 +1477,7 @@ acquirelock: #ifdef CONFIG_SMP /* Signal the other CPUs to enter kgdb_wait() */ - if ((!kgdb_single_step || !kgdb_contthread) && kgdb_do_roundup) + if ((!kgdb_single_step) && kgdb_do_roundup) kgdb_roundup_cpus(flags); #endif @@ -1494,7 +1496,7 @@ acquirelock: kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code); kgdb_deactivate_sw_breakpoints(); kgdb_single_step = 0; - kgdb_contthread = NULL; + kgdb_contthread = current; exception_level = 0; /* Talk to debugger with gdbserial protocol */ @@ -1508,7 +1510,7 @@ acquirelock: kgdb_info[ks->cpu].task = NULL; atomic_set(&cpu_in_kgdb[ks->cpu], 0); - if (!kgdb_single_step || !kgdb_contthread) { + if (!kgdb_single_step) { for (i = NR_CPUS-1; i >= 0; i--) atomic_set(&passive_cpu_wait[i], 0); /* @@ -1524,6 +1526,7 @@ acquirelock: kgdb_restore: /* Free kgdb_active */ atomic_set(&kgdb_active, -1); + touch_softlockup_watchdog(); clocksource_touch_watchdog(); local_irq_restore(flags); diff --git a/kernel/kmod.c b/kernel/kmod.c index 2456d1a0bef..3d3c3ea3a02 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -113,7 +113,7 @@ int request_module(const char *fmt, ...) return ret; } EXPORT_SYMBOL(request_module); -#endif /* CONFIG_KMOD */ +#endif /* CONFIG_MODULES */ struct subprocess_info { struct work_struct work; @@ -265,7 +265,7 @@ static void __call_usermodehelper(struct work_struct *work) } } -#ifdef CONFIG_PM +#ifdef CONFIG_PM_SLEEP /* * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY * (used for preventing user land processes from being created after the user @@ -288,39 +288,37 @@ static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); */ #define RUNNING_HELPERS_TIMEOUT (5 * HZ) -static int usermodehelper_pm_callback(struct notifier_block *nfb, - unsigned long action, - void *ignored) +/** + * usermodehelper_disable - prevent new helpers from being started + */ +int usermodehelper_disable(void) { long retval; - switch (action) { - case PM_HIBERNATION_PREPARE: - case PM_SUSPEND_PREPARE: - usermodehelper_disabled = 1; - smp_mb(); - /* - * From now on call_usermodehelper_exec() won't start any new - * helpers, so it is sufficient if running_helpers turns out to - * be zero at one point (it may be increased later, but that - * doesn't matter). - */ - retval = wait_event_timeout(running_helpers_waitq, + usermodehelper_disabled = 1; + smp_mb(); + /* + * From now on call_usermodehelper_exec() won't start any new + * helpers, so it is sufficient if running_helpers turns out to + * be zero at one point (it may be increased later, but that + * doesn't matter). + */ + retval = wait_event_timeout(running_helpers_waitq, atomic_read(&running_helpers) == 0, RUNNING_HELPERS_TIMEOUT); - if (retval) { - return NOTIFY_OK; - } else { - usermodehelper_disabled = 0; - return NOTIFY_BAD; - } - case PM_POST_HIBERNATION: - case PM_POST_SUSPEND: - usermodehelper_disabled = 0; - return NOTIFY_OK; - } + if (retval) + return 0; - return NOTIFY_DONE; + usermodehelper_disabled = 0; + return -EAGAIN; +} + +/** + * usermodehelper_enable - allow new helpers to be started again + */ +void usermodehelper_enable(void) +{ + usermodehelper_disabled = 0; } static void helper_lock(void) @@ -334,18 +332,12 @@ static void helper_unlock(void) if (atomic_dec_and_test(&running_helpers)) wake_up(&running_helpers_waitq); } - -static void register_pm_notifier_callback(void) -{ - pm_notifier(usermodehelper_pm_callback, 0); -} -#else /* CONFIG_PM */ +#else /* CONFIG_PM_SLEEP */ #define usermodehelper_disabled 0 static inline void helper_lock(void) {} static inline void helper_unlock(void) {} -static inline void register_pm_notifier_callback(void) {} -#endif /* CONFIG_PM */ +#endif /* CONFIG_PM_SLEEP */ /** * call_usermodehelper_setup - prepare to call a usermode helper @@ -515,5 +507,4 @@ void __init usermodehelper_init(void) { khelper_wq = create_singlethread_workqueue("khelper"); BUG_ON(!khelper_wq); - register_pm_notifier_callback(); } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 75bc2cd9ebc..8b57a2597f2 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -404,7 +404,7 @@ void kretprobe_hash_lock(struct task_struct *tsk, spin_lock_irqsave(hlist_lock, *flags); } -void kretprobe_table_lock(unsigned long hash, unsigned long *flags) +static void kretprobe_table_lock(unsigned long hash, unsigned long *flags) { spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); spin_lock_irqsave(hlist_lock, *flags); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index e53bc30e9ba..08dd8ed86c7 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -14,6 +14,7 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/kexec.h> +#include <linux/profile.h> #include <linux/sched.h> #define KERNEL_ATTR_RO(_name) \ @@ -53,6 +54,37 @@ static ssize_t uevent_helper_store(struct kobject *kobj, KERNEL_ATTR_RW(uevent_helper); #endif +#ifdef CONFIG_PROFILING +static ssize_t profiling_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", prof_on); +} +static ssize_t profiling_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int ret; + + if (prof_on) + return -EEXIST; + /* + * This eventually calls into get_option() which + * has a ton of callers and is not const. It is + * easiest to cast it away here. + */ + profile_setup((char *)buf); + ret = profile_init(); + if (ret) + return ret; + ret = create_proc_profile(); + if (ret) + return ret; + return count; +} +KERNEL_ATTR_RW(profiling); +#endif + #ifdef CONFIG_KEXEC static ssize_t kexec_loaded_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -109,6 +141,9 @@ static struct attribute * kernel_attrs[] = { &uevent_seqnum_attr.attr, &uevent_helper_attr.attr, #endif +#ifdef CONFIG_PROFILING + &profiling_attr.attr, +#endif #ifdef CONFIG_KEXEC &kexec_loaded_attr.attr, &kexec_crash_loaded_attr.attr, diff --git a/kernel/module.c b/kernel/module.c index 9db11911e04..25bc9ac9e22 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -100,7 +100,7 @@ static inline int strong_try_module_get(struct module *mod) static inline void add_taint_module(struct module *mod, unsigned flag) { add_taint(flag); - mod->taints |= flag; + mod->taints |= (1U << flag); } /* @@ -784,6 +784,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags) mutex_lock(&module_mutex); /* Store the name of the last unloaded module for diagnostic purposes */ strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); + unregister_dynamic_debug_module(mod->name); free_module(mod); out: @@ -923,7 +924,7 @@ static const char vermagic[] = VERMAGIC_STRING; static int try_to_force_load(struct module *mod, const char *symname) { #ifdef CONFIG_MODULE_FORCE_LOAD - if (!(tainted & TAINT_FORCED_MODULE)) + if (!test_taint(TAINT_FORCED_MODULE)) printk("%s: no version for \"%s\" found: kernel tainted.\n", mod->name, symname); add_taint_module(mod, TAINT_FORCED_MODULE); @@ -1033,7 +1034,7 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs, const unsigned long *crc; ret = find_symbol(name, &owner, &crc, - !(mod->taints & TAINT_PROPRIETARY_MODULE), true); + !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); if (!IS_ERR_VALUE(ret)) { /* use_module can fail due to OOM, or module initialization or unloading */ @@ -1173,7 +1174,7 @@ static void free_notes_attrs(struct module_notes_attrs *notes_attrs, while (i-- > 0) sysfs_remove_bin_file(notes_attrs->dir, ¬es_attrs->attrs[i]); - kobject_del(notes_attrs->dir); + kobject_put(notes_attrs->dir); } kfree(notes_attrs); } @@ -1634,7 +1635,7 @@ static void set_license(struct module *mod, const char *license) license = "unspecified"; if (!license_is_gpl_compatible(license)) { - if (!(tainted & TAINT_PROPRIETARY_MODULE)) + if (!test_taint(TAINT_PROPRIETARY_MODULE)) printk(KERN_WARNING "%s: module license '%s' taints " "kernel.\n", mod->name, license); add_taint_module(mod, TAINT_PROPRIETARY_MODULE); @@ -1783,6 +1784,33 @@ static inline void add_kallsyms(struct module *mod, } #endif /* CONFIG_KALLSYMS */ +#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG +static void dynamic_printk_setup(Elf_Shdr *sechdrs, unsigned int verboseindex) +{ + struct mod_debug *debug_info; + unsigned long pos, end; + unsigned int num_verbose; + + pos = sechdrs[verboseindex].sh_addr; + num_verbose = sechdrs[verboseindex].sh_size / + sizeof(struct mod_debug); + end = pos + (num_verbose * sizeof(struct mod_debug)); + + for (; pos < end; pos += sizeof(struct mod_debug)) { + debug_info = (struct mod_debug *)pos; + register_dynamic_debug_module(debug_info->modname, + debug_info->type, debug_info->logical_modname, + debug_info->flag_names, debug_info->hash, + debug_info->hash2); + } +} +#else +static inline void dynamic_printk_setup(Elf_Shdr *sechdrs, + unsigned int verboseindex) +{ +} +#endif /* CONFIG_DYNAMIC_PRINTK_DEBUG */ + static void *module_alloc_update_bounds(unsigned long size) { void *ret = module_alloc(size); @@ -1806,6 +1834,7 @@ static noinline struct module *load_module(void __user *umod, Elf_Ehdr *hdr; Elf_Shdr *sechdrs; char *secstrings, *args, *modmagic, *strtab = NULL; + char *staging; unsigned int i; unsigned int symindex = 0; unsigned int strindex = 0; @@ -1831,6 +1860,7 @@ static noinline struct module *load_module(void __user *umod, #endif unsigned int markersindex; unsigned int markersstringsindex; + unsigned int verboseindex; struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ @@ -1960,6 +1990,14 @@ static noinline struct module *load_module(void __user *umod, goto free_hdr; } + staging = get_modinfo(sechdrs, infoindex, "staging"); + if (staging) { + add_taint_module(mod, TAINT_CRAP); + printk(KERN_WARNING "%s: module is from the staging directory," + " the quality is unknown, you have been warned.\n", + mod->name); + } + /* Now copy in args */ args = strndup_user(uargs, ~0UL >> 1); if (IS_ERR(args)) { @@ -2117,6 +2155,7 @@ static noinline struct module *load_module(void __user *umod, markersindex = find_sec(hdr, sechdrs, secstrings, "__markers"); markersstringsindex = find_sec(hdr, sechdrs, secstrings, "__markers_strings"); + verboseindex = find_sec(hdr, sechdrs, secstrings, "__verbose"); /* Now do relocations. */ for (i = 1; i < hdr->e_shnum; i++) { @@ -2167,6 +2206,7 @@ static noinline struct module *load_module(void __user *umod, marker_update_probe_range(mod->markers, mod->markers + mod->num_markers); #endif + dynamic_printk_setup(sechdrs, verboseindex); err = module_finalize(hdr, sechdrs, mod); if (err < 0) goto cleanup; @@ -2552,10 +2592,12 @@ static char *module_flags(struct module *mod, char *buf) mod->state == MODULE_STATE_GOING || mod->state == MODULE_STATE_COMING) { buf[bx++] = '('; - if (mod->taints & TAINT_PROPRIETARY_MODULE) + if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) buf[bx++] = 'P'; - if (mod->taints & TAINT_FORCED_MODULE) + if (mod->taints & (1 << TAINT_FORCED_MODULE)) buf[bx++] = 'F'; + if (mod->taints & (1 << TAINT_CRAP)) + buf[bx++] = 'C'; /* * TAINT_FORCED_RMMOD: could be added. * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't diff --git a/kernel/panic.c b/kernel/panic.c index 12c5a0a6c89..bda561ef3cd 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -23,7 +23,7 @@ #include <linux/kallsyms.h> int panic_on_oops; -int tainted; +static unsigned long tainted_mask; static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); @@ -143,6 +143,27 @@ NORET_TYPE void panic(const char * fmt, ...) EXPORT_SYMBOL(panic); + +struct tnt { + u8 bit; + char true; + char false; +}; + +static const struct tnt tnts[] = { + { TAINT_PROPRIETARY_MODULE, 'P', 'G' }, + { TAINT_FORCED_MODULE, 'F', ' ' }, + { TAINT_UNSAFE_SMP, 'S', ' ' }, + { TAINT_FORCED_RMMOD, 'R', ' ' }, + { TAINT_MACHINE_CHECK, 'M', ' ' }, + { TAINT_BAD_PAGE, 'B', ' ' }, + { TAINT_USER, 'U', ' ' }, + { TAINT_DIE, 'D', ' ' }, + { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, + { TAINT_WARN, 'W', ' ' }, + { TAINT_CRAP, 'C', ' ' }, +}; + /** * print_tainted - return a string to represent the kernel taint state. * @@ -155,35 +176,45 @@ EXPORT_SYMBOL(panic); * 'U' - Userspace-defined naughtiness. * 'A' - ACPI table overridden. * 'W' - Taint on warning. + * 'C' - modules from drivers/staging are loaded. * * The string is overwritten by the next call to print_taint(). */ - const char *print_tainted(void) { - static char buf[20]; - if (tainted) { - snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c%c", - tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', - tainted & TAINT_FORCED_MODULE ? 'F' : ' ', - tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', - tainted & TAINT_FORCED_RMMOD ? 'R' : ' ', - tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', - tainted & TAINT_BAD_PAGE ? 'B' : ' ', - tainted & TAINT_USER ? 'U' : ' ', - tainted & TAINT_DIE ? 'D' : ' ', - tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' ', - tainted & TAINT_WARN ? 'W' : ' '); - } - else + static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ") + 1]; + + if (tainted_mask) { + char *s; + int i; + + s = buf + sprintf(buf, "Tainted: "); + for (i = 0; i < ARRAY_SIZE(tnts); i++) { + const struct tnt *t = &tnts[i]; + *s++ = test_bit(t->bit, &tainted_mask) ? + t->true : t->false; + } + *s = 0; + } else snprintf(buf, sizeof(buf), "Not tainted"); return(buf); } +int test_taint(unsigned flag) +{ + return test_bit(flag, &tainted_mask); +} +EXPORT_SYMBOL(test_taint); + +unsigned long get_taint(void) +{ + return tainted_mask; +} + void add_taint(unsigned flag) { debug_locks = 0; /* can't trust the integrity of the kernel anymore */ - tainted |= flag; + set_bit(flag, &tainted_mask); } EXPORT_SYMBOL(add_taint); diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 95451bf7d2e..b931d7cedbf 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -223,6 +223,15 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp) } /* + * Get monotonic time for posix timers + */ +static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) +{ + getrawmonotonic(tp); + return 0; +} + +/* * Initialize everything, well, just everything in Posix clocks/timers ;) */ static __init int init_posix_timers(void) @@ -235,9 +244,15 @@ static __init int init_posix_timers(void) .clock_get = posix_ktime_get_ts, .clock_set = do_posix_clock_nosettime, }; + struct k_clock clock_monotonic_raw = { + .clock_getres = hrtimer_get_res, + .clock_get = posix_get_monotonic_raw, + .clock_set = do_posix_clock_nosettime, + }; register_posix_clock(CLOCK_REALTIME, &clock_realtime); register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); + register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, @@ -427,7 +442,7 @@ static struct k_itimer * alloc_posix_timer(void) return tmr; if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { kmem_cache_free(posix_timers_cache, tmr); - tmr = NULL; + return NULL; } memset(&tmr->sigq->info, 0, sizeof(siginfo_t)); return tmr; diff --git a/kernel/power/disk.c b/kernel/power/disk.c index bbd85c60f74..331f9836383 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -14,6 +14,7 @@ #include <linux/reboot.h> #include <linux/string.h> #include <linux/device.h> +#include <linux/kmod.h> #include <linux/delay.h> #include <linux/fs.h> #include <linux/mount.h> @@ -520,6 +521,10 @@ int hibernate(void) if (error) goto Exit; + error = usermodehelper_disable(); + if (error) + goto Exit; + /* Allocate memory management structures */ error = create_basic_memory_bitmaps(); if (error) @@ -558,6 +563,7 @@ int hibernate(void) thaw_processes(); Finish: free_basic_memory_bitmaps(); + usermodehelper_enable(); Exit: pm_notifier_call_chain(PM_POST_HIBERNATION); pm_restore_console(); @@ -634,6 +640,10 @@ static int software_resume(void) if (error) goto Finish; + error = usermodehelper_disable(); + if (error) + goto Finish; + error = create_basic_memory_bitmaps(); if (error) goto Finish; @@ -656,6 +666,7 @@ static int software_resume(void) thaw_processes(); Done: free_basic_memory_bitmaps(); + usermodehelper_enable(); Finish: pm_notifier_call_chain(PM_POST_RESTORE); pm_restore_console(); diff --git a/kernel/power/main.c b/kernel/power/main.c index 540b16b6856..19122cf6d82 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -14,6 +14,7 @@ #include <linux/string.h> #include <linux/delay.h> #include <linux/errno.h> +#include <linux/kmod.h> #include <linux/init.h> #include <linux/console.h> #include <linux/cpu.h> @@ -237,6 +238,10 @@ static int suspend_prepare(void) if (error) goto Finish; + error = usermodehelper_disable(); + if (error) + goto Finish; + if (suspend_freeze_processes()) { error = -EAGAIN; goto Thaw; @@ -256,6 +261,7 @@ static int suspend_prepare(void) Thaw: suspend_thaw_processes(); + usermodehelper_enable(); Finish: pm_notifier_call_chain(PM_POST_SUSPEND); pm_restore_console(); @@ -376,6 +382,7 @@ int suspend_devices_and_enter(suspend_state_t state) static void suspend_finish(void) { suspend_thaw_processes(); + usermodehelper_enable(); pm_notifier_call_chain(PM_POST_SUSPEND); pm_restore_console(); } diff --git a/kernel/power/user.c b/kernel/power/user.c index a6332a31326..005b93d839b 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -212,13 +212,20 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, case SNAPSHOT_FREEZE: if (data->frozen) break; + printk("Syncing filesystems ... "); sys_sync(); printk("done.\n"); - error = freeze_processes(); + error = usermodehelper_disable(); if (error) + break; + + error = freeze_processes(); + if (error) { thaw_processes(); + usermodehelper_enable(); + } if (!error) data->frozen = 1; break; @@ -227,6 +234,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, if (!data->frozen || data->ready) break; thaw_processes(); + usermodehelper_enable(); data->frozen = 0; break; diff --git a/kernel/printk.c b/kernel/printk.c index b51b1567bb5..6341af77eb6 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -13,7 +13,7 @@ * Fixed SMP synchronization, 08/08/99, Manfred Spraul * manfred@colorfullife.com * Rewrote bits to get rid of console_lock - * 01Mar01 Andrew Morton <andrewm@uow.edu.au> + * 01Mar01 Andrew Morton */ #include <linux/kernel.h> @@ -577,9 +577,6 @@ static int have_callable_console(void) * @fmt: format string * * This is printk(). It can be called from any context. We want it to work. - * Be aware of the fact that if oops_in_progress is not set, we might try to - * wake klogd up which could deadlock on runqueue lock if printk() is called - * from scheduler code. * * We try to grab the console_sem. If we succeed, it's easy - we log the output and * call the console drivers. If we fail to get the semaphore we place the output @@ -593,6 +590,8 @@ static int have_callable_console(void) * * See also: * printf(3) + * + * See the vsnprintf() documentation for format string extensions over C99. */ asmlinkage int printk(const char *fmt, ...) @@ -982,10 +981,25 @@ int is_console_locked(void) return console_locked; } -void wake_up_klogd(void) +static DEFINE_PER_CPU(int, printk_pending); + +void printk_tick(void) { - if (!oops_in_progress && waitqueue_active(&log_wait)) + if (__get_cpu_var(printk_pending)) { + __get_cpu_var(printk_pending) = 0; wake_up_interruptible(&log_wait); + } +} + +int printk_needs_cpu(int cpu) +{ + return per_cpu(printk_pending, cpu); +} + +void wake_up_klogd(void) +{ + if (waitqueue_active(&log_wait)) + __raw_get_cpu_var(printk_pending) = 1; } /** @@ -1291,22 +1305,6 @@ static int __init disable_boot_consoles(void) } late_initcall(disable_boot_consoles); -/** - * tty_write_message - write a message to a certain tty, not just the console. - * @tty: the destination tty_struct - * @msg: the message to write - * - * This is used for messages that need to be redirected to a specific tty. - * We don't put it into the syslog queue right now maybe in the future if - * really needed. - */ -void tty_write_message(struct tty_struct *tty, char *msg) -{ - if (tty && tty->ops->write) - tty->ops->write(tty, msg, strlen(msg)); - return; -} - #if defined CONFIG_PRINTK /* diff --git a/kernel/profile.c b/kernel/profile.c index cd26bed4cc2..a9e422df6bf 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -22,6 +22,8 @@ #include <linux/cpu.h> #include <linux/highmem.h> #include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> #include <asm/sections.h> #include <asm/irq_regs.h> #include <asm/ptrace.h> @@ -50,11 +52,11 @@ static DEFINE_PER_CPU(int, cpu_profile_flip); static DEFINE_MUTEX(profile_flip_mutex); #endif /* CONFIG_SMP */ -static int __init profile_setup(char *str) +int profile_setup(char *str) { - static char __initdata schedstr[] = "schedule"; - static char __initdata sleepstr[] = "sleep"; - static char __initdata kvmstr[] = "kvm"; + static char schedstr[] = "schedule"; + static char sleepstr[] = "sleep"; + static char kvmstr[] = "kvm"; int par; if (!strncmp(str, sleepstr, strlen(sleepstr))) { @@ -100,14 +102,33 @@ static int __init profile_setup(char *str) __setup("profile=", profile_setup); -void __init profile_init(void) +int profile_init(void) { + int buffer_bytes; if (!prof_on) - return; + return 0; /* only text is profiled */ prof_len = (_etext - _stext) >> prof_shift; - prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t)); + buffer_bytes = prof_len*sizeof(atomic_t); + if (!slab_is_available()) { + prof_buffer = alloc_bootmem(buffer_bytes); + return 0; + } + + prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); + if (prof_buffer) + return 0; + + prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO); + if (prof_buffer) + return 0; + + prof_buffer = vmalloc(buffer_bytes); + if (prof_buffer) + return 0; + + return -ENOMEM; } /* Profile event notifications */ @@ -527,7 +548,7 @@ static void __init profile_nop(void *unused) { } -static int __init create_hash_tables(void) +static int create_hash_tables(void) { int cpu; @@ -575,14 +596,14 @@ out_cleanup: #define create_hash_tables() ({ 0; }) #endif -static int __init create_proc_profile(void) +int create_proc_profile(void) { struct proc_dir_entry *entry; if (!prof_on) return 0; if (create_hash_tables()) - return -1; + return -ENOMEM; entry = proc_create("profile", S_IWUSR | S_IRUGO, NULL, &proc_profile_operations); if (!entry) diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index aad93cdc9f6..37f72e55154 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -47,6 +47,7 @@ #include <linux/notifier.h> #include <linux/cpu.h> #include <linux/mutex.h> +#include <linux/time.h> #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; @@ -60,12 +61,14 @@ EXPORT_SYMBOL_GPL(rcu_lock_map); static struct rcu_ctrlblk rcu_ctrlblk = { .cur = -300, .completed = -300, + .pending = -300, .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), .cpumask = CPU_MASK_NONE, }; static struct rcu_ctrlblk rcu_bh_ctrlblk = { .cur = -300, .completed = -300, + .pending = -300, .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), .cpumask = CPU_MASK_NONE, }; @@ -83,7 +86,10 @@ static void force_quiescent_state(struct rcu_data *rdp, { int cpu; cpumask_t cpumask; + unsigned long flags; + set_need_resched(); + spin_lock_irqsave(&rcp->lock, flags); if (unlikely(!rcp->signaled)) { rcp->signaled = 1; /* @@ -109,6 +115,7 @@ static void force_quiescent_state(struct rcu_data *rdp, for_each_cpu_mask_nr(cpu, cpumask) smp_send_reschedule(cpu); } + spin_unlock_irqrestore(&rcp->lock, flags); } #else static inline void force_quiescent_state(struct rcu_data *rdp, @@ -118,6 +125,126 @@ static inline void force_quiescent_state(struct rcu_data *rdp, } #endif +static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp, + struct rcu_data *rdp) +{ + long batch; + + head->next = NULL; + smp_mb(); /* Read of rcu->cur must happen after any change by caller. */ + + /* + * Determine the batch number of this callback. + * + * Using ACCESS_ONCE to avoid the following error when gcc eliminates + * local variable "batch" and emits codes like this: + * 1) rdp->batch = rcp->cur + 1 # gets old value + * ...... + * 2)rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value + * then [*nxttail[0], *nxttail[1]) may contain callbacks + * that batch# = rdp->batch, see the comment of struct rcu_data. + */ + batch = ACCESS_ONCE(rcp->cur) + 1; + + if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) { + /* process callbacks */ + rdp->nxttail[0] = rdp->nxttail[1]; + rdp->nxttail[1] = rdp->nxttail[2]; + if (rcu_batch_after(batch - 1, rdp->batch)) + rdp->nxttail[0] = rdp->nxttail[2]; + } + + rdp->batch = batch; + *rdp->nxttail[2] = head; + rdp->nxttail[2] = &head->next; + + if (unlikely(++rdp->qlen > qhimark)) { + rdp->blimit = INT_MAX; + force_quiescent_state(rdp, &rcu_ctrlblk); + } +} + +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) +{ + rcp->gp_start = jiffies; + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; +} + +static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) +{ + int cpu; + long delta; + unsigned long flags; + + /* Only let one CPU complain about others per time interval. */ + + spin_lock_irqsave(&rcp->lock, flags); + delta = jiffies - rcp->jiffies_stall; + if (delta < 2 || rcp->cur != rcp->completed) { + spin_unlock_irqrestore(&rcp->lock, flags); + return; + } + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + spin_unlock_irqrestore(&rcp->lock, flags); + + /* OK, time to rat on our buddy... */ + + printk(KERN_ERR "RCU detected CPU stalls:"); + for_each_possible_cpu(cpu) { + if (cpu_isset(cpu, rcp->cpumask)) + printk(" %d", cpu); + } + printk(" (detected by %d, t=%ld jiffies)\n", + smp_processor_id(), (long)(jiffies - rcp->gp_start)); +} + +static void print_cpu_stall(struct rcu_ctrlblk *rcp) +{ + unsigned long flags; + + printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", + smp_processor_id(), jiffies, + jiffies - rcp->gp_start); + dump_stack(); + spin_lock_irqsave(&rcp->lock, flags); + if ((long)(jiffies - rcp->jiffies_stall) >= 0) + rcp->jiffies_stall = + jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + spin_unlock_irqrestore(&rcp->lock, flags); + set_need_resched(); /* kick ourselves to get things going. */ +} + +static void check_cpu_stall(struct rcu_ctrlblk *rcp) +{ + long delta; + + delta = jiffies - rcp->jiffies_stall; + if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) { + + /* We haven't checked in, so go dump stack. */ + print_cpu_stall(rcp); + + } else if (rcp->cur != rcp->completed && delta >= 2) { + + /* They had two seconds to dump stack, so complain. */ + print_other_cpu_stall(rcp); + } +} + +#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) +{ +} + +static inline void check_cpu_stall(struct rcu_ctrlblk *rcp) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + /** * call_rcu - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. @@ -133,18 +260,10 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { unsigned long flags; - struct rcu_data *rdp; head->func = func; - head->next = NULL; local_irq_save(flags); - rdp = &__get_cpu_var(rcu_data); - *rdp->nxttail = head; - rdp->nxttail = &head->next; - if (unlikely(++rdp->qlen > qhimark)) { - rdp->blimit = INT_MAX; - force_quiescent_state(rdp, &rcu_ctrlblk); - } + __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data)); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(call_rcu); @@ -169,20 +288,10 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { unsigned long flags; - struct rcu_data *rdp; head->func = func; - head->next = NULL; local_irq_save(flags); - rdp = &__get_cpu_var(rcu_bh_data); - *rdp->nxttail = head; - rdp->nxttail = &head->next; - - if (unlikely(++rdp->qlen > qhimark)) { - rdp->blimit = INT_MAX; - force_quiescent_state(rdp, &rcu_bh_ctrlblk); - } - + __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(call_rcu_bh); @@ -211,12 +320,6 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); static inline void raise_rcu_softirq(void) { raise_softirq(RCU_SOFTIRQ); - /* - * The smp_mb() here is required to ensure that this cpu's - * __rcu_process_callbacks() reads the most recently updated - * value of rcu->cur. - */ - smp_mb(); } /* @@ -225,6 +328,7 @@ static inline void raise_rcu_softirq(void) */ static void rcu_do_batch(struct rcu_data *rdp) { + unsigned long flags; struct rcu_head *next, *list; int count = 0; @@ -239,9 +343,9 @@ static void rcu_do_batch(struct rcu_data *rdp) } rdp->donelist = list; - local_irq_disable(); + local_irq_save(flags); rdp->qlen -= count; - local_irq_enable(); + local_irq_restore(flags); if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) rdp->blimit = blimit; @@ -269,6 +373,7 @@ static void rcu_do_batch(struct rcu_data *rdp) * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace * period (if necessary). */ + /* * Register a new batch of callbacks, and start it up if there is currently no * active batch and the batch to be registered has not already occurred. @@ -276,15 +381,10 @@ static void rcu_do_batch(struct rcu_data *rdp) */ static void rcu_start_batch(struct rcu_ctrlblk *rcp) { - if (rcp->next_pending && + if (rcp->cur != rcp->pending && rcp->completed == rcp->cur) { - rcp->next_pending = 0; - /* - * next_pending == 0 must be visible in - * __rcu_process_callbacks() before it can see new value of cur. - */ - smp_wmb(); rcp->cur++; + record_gp_stall_check_time(rcp); /* * Accessing nohz_cpu_mask before incrementing rcp->cur needs a @@ -322,6 +422,8 @@ static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { + unsigned long flags; + if (rdp->quiescbatch != rcp->cur) { /* start new grace period: */ rdp->qs_pending = 1; @@ -345,7 +447,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, return; rdp->qs_pending = 0; - spin_lock(&rcp->lock); + spin_lock_irqsave(&rcp->lock, flags); /* * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync * during cpu startup. Ignore the quiescent state. @@ -353,7 +455,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, if (likely(rdp->quiescbatch == rcp->cur)) cpu_quiet(rdp->cpu, rcp); - spin_unlock(&rcp->lock); + spin_unlock_irqrestore(&rcp->lock, flags); } @@ -364,33 +466,38 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, * which is dead and hence not processing interrupts. */ static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, - struct rcu_head **tail) + struct rcu_head **tail, long batch) { - local_irq_disable(); - *this_rdp->nxttail = list; - if (list) - this_rdp->nxttail = tail; - local_irq_enable(); + unsigned long flags; + + if (list) { + local_irq_save(flags); + this_rdp->batch = batch; + *this_rdp->nxttail[2] = list; + this_rdp->nxttail[2] = tail; + local_irq_restore(flags); + } } static void __rcu_offline_cpu(struct rcu_data *this_rdp, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - /* if the cpu going offline owns the grace period + unsigned long flags; + + /* + * if the cpu going offline owns the grace period * we can block indefinitely waiting for it, so flush * it here */ - spin_lock_bh(&rcp->lock); + spin_lock_irqsave(&rcp->lock, flags); if (rcp->cur != rcp->completed) cpu_quiet(rdp->cpu, rcp); - spin_unlock_bh(&rcp->lock); - rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); - rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); - rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); + rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1); + rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1); + spin_unlock(&rcp->lock); - local_irq_disable(); this_rdp->qlen += rdp->qlen; - local_irq_enable(); + local_irq_restore(flags); } static void rcu_offline_cpu(int cpu) @@ -420,38 +527,52 @@ static void rcu_offline_cpu(int cpu) static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { - *rdp->donetail = rdp->curlist; - rdp->donetail = rdp->curtail; - rdp->curlist = NULL; - rdp->curtail = &rdp->curlist; - } + unsigned long flags; + long completed_snap; - if (rdp->nxtlist && !rdp->curlist) { - local_irq_disable(); - rdp->curlist = rdp->nxtlist; - rdp->curtail = rdp->nxttail; - rdp->nxtlist = NULL; - rdp->nxttail = &rdp->nxtlist; - local_irq_enable(); + if (rdp->nxtlist) { + local_irq_save(flags); + completed_snap = ACCESS_ONCE(rcp->completed); /* - * start the next batch of callbacks + * move the other grace-period-completed entries to + * [rdp->nxtlist, *rdp->nxttail[0]) temporarily */ + if (!rcu_batch_before(completed_snap, rdp->batch)) + rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2]; + else if (!rcu_batch_before(completed_snap, rdp->batch - 1)) + rdp->nxttail[0] = rdp->nxttail[1]; - /* determine batch number */ - rdp->batch = rcp->cur + 1; - /* see the comment and corresponding wmb() in - * the rcu_start_batch() + /* + * the grace period for entries in + * [rdp->nxtlist, *rdp->nxttail[0]) has completed and + * move these entries to donelist */ - smp_rmb(); + if (rdp->nxttail[0] != &rdp->nxtlist) { + *rdp->donetail = rdp->nxtlist; + rdp->donetail = rdp->nxttail[0]; + rdp->nxtlist = *rdp->nxttail[0]; + *rdp->donetail = NULL; + + if (rdp->nxttail[1] == rdp->nxttail[0]) + rdp->nxttail[1] = &rdp->nxtlist; + if (rdp->nxttail[2] == rdp->nxttail[0]) + rdp->nxttail[2] = &rdp->nxtlist; + rdp->nxttail[0] = &rdp->nxtlist; + } + + local_irq_restore(flags); + + if (rcu_batch_after(rdp->batch, rcp->pending)) { + unsigned long flags2; - if (!rcp->next_pending) { /* and start it/schedule start if it's a new batch */ - spin_lock(&rcp->lock); - rcp->next_pending = 1; - rcu_start_batch(rcp); - spin_unlock(&rcp->lock); + spin_lock_irqsave(&rcp->lock, flags2); + if (rcu_batch_after(rdp->batch, rcp->pending)) { + rcp->pending = rdp->batch; + rcu_start_batch(rcp); + } + spin_unlock_irqrestore(&rcp->lock, flags2); } } @@ -462,21 +583,53 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, static void rcu_process_callbacks(struct softirq_action *unused) { + /* + * Memory references from any prior RCU read-side critical sections + * executed by the interrupted code must be see before any RCU + * grace-period manupulations below. + */ + + smp_mb(); /* See above block comment. */ + __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); + + /* + * Memory references from any later RCU read-side critical sections + * executed by the interrupted code must be see after any RCU + * grace-period manupulations above. + */ + + smp_mb(); /* See above block comment. */ } static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - /* This cpu has pending rcu entries and the grace period - * for them has completed. - */ - if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) - return 1; + /* Check for CPU stalls, if enabled. */ + check_cpu_stall(rcp); - /* This cpu has no pending entries, but there are new entries */ - if (!rdp->curlist && rdp->nxtlist) - return 1; + if (rdp->nxtlist) { + long completed_snap = ACCESS_ONCE(rcp->completed); + + /* + * This cpu has pending rcu entries and the grace period + * for them has completed. + */ + if (!rcu_batch_before(completed_snap, rdp->batch)) + return 1; + if (!rcu_batch_before(completed_snap, rdp->batch - 1) && + rdp->nxttail[0] != rdp->nxttail[1]) + return 1; + if (rdp->nxttail[0] != &rdp->nxtlist) + return 1; + + /* + * This cpu has pending rcu entries and the new batch + * for then hasn't been started nor scheduled start + */ + if (rcu_batch_after(rdp->batch, rcp->pending)) + return 1; + } /* This cpu has finished callbacks to invoke */ if (rdp->donelist) @@ -512,9 +665,15 @@ int rcu_needs_cpu(int cpu) struct rcu_data *rdp = &per_cpu(rcu_data, cpu); struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); - return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); + return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu); } +/* + * Top-level function driving RCU grace-period detection, normally + * invoked from the scheduler-clock interrupt. This function simply + * increments counters that are read only from softirq by this same + * CPU, so there are no memory barriers required. + */ void rcu_check_callbacks(int cpu, int user) { if (user || @@ -558,14 +717,17 @@ void rcu_check_callbacks(int cpu, int user) static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { + unsigned long flags; + + spin_lock_irqsave(&rcp->lock, flags); memset(rdp, 0, sizeof(*rdp)); - rdp->curtail = &rdp->curlist; - rdp->nxttail = &rdp->nxtlist; + rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist; rdp->donetail = &rdp->donelist; rdp->quiescbatch = rcp->completed; rdp->qs_pending = 0; rdp->cpu = cpu; rdp->blimit = blimit; + spin_unlock_irqrestore(&rcp->lock, flags); } static void __cpuinit rcu_online_cpu(int cpu) @@ -610,6 +772,9 @@ static struct notifier_block __cpuinitdata rcu_nb = { */ void __init __rcu_init(void) { +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)smp_processor_id()); /* Register notifier for non-boot CPUs */ diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 27827931ca0..ca4bbbe04aa 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -59,14 +59,6 @@ #include <linux/rcupreempt_trace.h> /* - * Macro that prevents the compiler from reordering accesses, but does - * absolutely -nothing- to prevent CPUs from reordering. This is used - * only to mediate communication between mainline code and hardware - * interrupt and NMI handlers. - */ -#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) - -/* * PREEMPT_RCU data structures. */ diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c index 5edf82c34bb..35c2d3360ec 100644 --- a/kernel/rcupreempt_trace.c +++ b/kernel/rcupreempt_trace.c @@ -308,11 +308,16 @@ out: static int __init rcupreempt_trace_init(void) { + int ret; + mutex_init(&rcupreempt_trace_mutex); rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL); if (!rcupreempt_trace_buf) return 1; - return rcupreempt_debugfs_init(); + ret = rcupreempt_debugfs_init(); + if (ret) + kfree(rcupreempt_trace_buf); + return ret; } static void __exit rcupreempt_trace_cleanup(void) diff --git a/kernel/resource.c b/kernel/resource.c index 03d796c1b2e..4089d12af6e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -38,10 +38,6 @@ EXPORT_SYMBOL(iomem_resource); static DEFINE_RWLOCK(resource_lock); -#ifdef CONFIG_PROC_FS - -enum { MAX_IORES_LEVEL = 5 }; - static void *r_next(struct seq_file *m, void *v, loff_t *pos) { struct resource *p = v; @@ -53,6 +49,10 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos) return p->sibling; } +#ifdef CONFIG_PROC_FS + +enum { MAX_IORES_LEVEL = 5 }; + static void *r_start(struct seq_file *m, loff_t *pos) __acquires(resource_lock) { @@ -516,6 +516,70 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t return result; } +static void __init __reserve_region_with_split(struct resource *root, + resource_size_t start, resource_size_t end, + const char *name) +{ + struct resource *parent = root; + struct resource *conflict; + struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); + + if (!res) + return; + + res->name = name; + res->start = start; + res->end = end; + res->flags = IORESOURCE_BUSY; + + for (;;) { + conflict = __request_resource(parent, res); + if (!conflict) + break; + if (conflict != parent) { + parent = conflict; + if (!(conflict->flags & IORESOURCE_BUSY)) + continue; + } + + /* Uhhuh, that didn't work out.. */ + kfree(res); + res = NULL; + break; + } + + if (!res) { + /* failed, split and try again */ + + /* conflict covered whole area */ + if (conflict->start <= start && conflict->end >= end) + return; + + if (conflict->start > start) + __reserve_region_with_split(root, start, conflict->start-1, name); + if (!(conflict->flags & IORESOURCE_BUSY)) { + resource_size_t common_start, common_end; + + common_start = max(conflict->start, start); + common_end = min(conflict->end, end); + if (common_start < common_end) + __reserve_region_with_split(root, common_start, common_end, name); + } + if (conflict->end < end) + __reserve_region_with_split(root, conflict->end+1, end, name); + } + +} + +void reserve_region_with_split(struct resource *root, + resource_size_t start, resource_size_t end, + const char *name) +{ + write_lock(&resource_lock); + __reserve_region_with_split(root, start, end, name); + write_unlock(&resource_lock); +} + EXPORT_SYMBOL(adjust_resource); /** @@ -562,33 +626,34 @@ struct resource * __request_region(struct resource *parent, { struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); - if (res) { - res->name = name; - res->start = start; - res->end = start + n - 1; - res->flags = IORESOURCE_BUSY; + if (!res) + return NULL; - write_lock(&resource_lock); + res->name = name; + res->start = start; + res->end = start + n - 1; + res->flags = IORESOURCE_BUSY; - for (;;) { - struct resource *conflict; + write_lock(&resource_lock); - conflict = __request_resource(parent, res); - if (!conflict) - break; - if (conflict != parent) { - parent = conflict; - if (!(conflict->flags & IORESOURCE_BUSY)) - continue; - } + for (;;) { + struct resource *conflict; - /* Uhhuh, that didn't work out.. */ - kfree(res); - res = NULL; + conflict = __request_resource(parent, res); + if (!conflict) break; + if (conflict != parent) { + parent = conflict; + if (!(conflict->flags & IORESOURCE_BUSY)) + continue; } - write_unlock(&resource_lock); + + /* Uhhuh, that didn't work out.. */ + kfree(res); + res = NULL; + break; } + write_unlock(&resource_lock); return res; } EXPORT_SYMBOL(__request_region); @@ -763,3 +828,40 @@ static int __init reserve_setup(char *str) } __setup("reserve=", reserve_setup); + +/* + * Check if the requested addr and size spans more than any slot in the + * iomem resource tree. + */ +int iomem_map_sanity_check(resource_size_t addr, unsigned long size) +{ + struct resource *p = &iomem_resource; + int err = 0; + loff_t l; + + read_lock(&resource_lock); + for (p = p->child; p ; p = r_next(NULL, p, &l)) { + /* + * We can probably skip the resources without + * IORESOURCE_IO attribute? + */ + if (p->start >= addr + size) + continue; + if (p->end < addr) + continue; + if (p->start <= addr && (p->end >= addr + size - 1)) + continue; + printk(KERN_WARNING "resource map sanity check conflict: " + "0x%llx 0x%llx 0x%llx 0x%llx %s\n", + (unsigned long long)addr, + (unsigned long long)(addr + size - 1), + (unsigned long long)p->start, + (unsigned long long)p->end, + p->name); + err = -1; + break; + } + read_unlock(&resource_lock); + + return err; +} diff --git a/kernel/sched.c b/kernel/sched.c index ebb03def564..09a8c15748f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -201,14 +201,19 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rt_b->rt_period_timer.function = sched_rt_period_timer; - rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; +} + +static inline int rt_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; } static void start_rt_bandwidth(struct rt_bandwidth *rt_b) { ktime_t now; - if (rt_b->rt_runtime == RUNTIME_INF) + if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) return; if (hrtimer_active(&rt_b->rt_period_timer)) @@ -298,9 +303,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; #endif /* CONFIG_RT_GROUP_SCHED */ -#else /* !CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_USER_SCHED */ #define root_task_group init_task_group -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* CONFIG_USER_SCHED */ /* task_group_lock serializes add/remove of task groups and also changes to * a task group's cpu shares. @@ -604,9 +609,9 @@ struct rq { static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) +static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) { - rq->curr->sched_class->check_preempt_curr(rq, p); + rq->curr->sched_class->check_preempt_curr(rq, p, sync); } static inline int cpu_of(struct rq *rq) @@ -1087,7 +1092,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_DONE; } -static void init_hrtick(void) +static __init void init_hrtick(void) { hotcpu_notifier(hotplug_hrtick, 0); } @@ -1102,7 +1107,7 @@ static void hrtick_start(struct rq *rq, u64 delay) hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); } -static void init_hrtick(void) +static inline void init_hrtick(void) { } #endif /* CONFIG_SMP */ @@ -1119,9 +1124,9 @@ static void init_rq_hrtick(struct rq *rq) hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rq->hrtick_timer.function = hrtick; - rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; } -#else +#else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) { } @@ -1133,7 +1138,7 @@ static inline void init_rq_hrtick(struct rq *rq) static inline void init_hrtick(void) { } -#endif +#endif /* CONFIG_SCHED_HRTICK */ /* * resched_task - mark a task 'to be rescheduled now'. @@ -1380,38 +1385,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) update_load_sub(&rq->load, load); } -#ifdef CONFIG_SMP -static unsigned long source_load(int cpu, int type); -static unsigned long target_load(int cpu, int type); -static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); - -static unsigned long cpu_avg_load_per_task(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - if (rq->nr_running) - rq->avg_load_per_task = rq->load.weight / rq->nr_running; - - return rq->avg_load_per_task; -} - -#ifdef CONFIG_FAIR_GROUP_SCHED - -typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *); +#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) +typedef int (*tg_visitor)(struct task_group *, void *); /* * Iterate the full tree, calling @down when first entering a node and @up when * leaving it for the final time. */ -static void -walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd) +static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) { struct task_group *parent, *child; + int ret; rcu_read_lock(); parent = &root_task_group; down: - (*down)(parent, cpu, sd); + ret = (*down)(parent, data); + if (ret) + goto out_unlock; list_for_each_entry_rcu(child, &parent->children, siblings) { parent = child; goto down; @@ -1419,14 +1410,42 @@ down: up: continue; } - (*up)(parent, cpu, sd); + ret = (*up)(parent, data); + if (ret) + goto out_unlock; child = parent; parent = parent->parent; if (parent) goto up; +out_unlock: rcu_read_unlock(); + + return ret; +} + +static int tg_nop(struct task_group *tg, void *data) +{ + return 0; } +#endif + +#ifdef CONFIG_SMP +static unsigned long source_load(int cpu, int type); +static unsigned long target_load(int cpu, int type); +static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); + +static unsigned long cpu_avg_load_per_task(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (rq->nr_running) + rq->avg_load_per_task = rq->load.weight / rq->nr_running; + + return rq->avg_load_per_task; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED static void __set_se_shares(struct sched_entity *se, unsigned long shares); @@ -1486,11 +1505,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu, * This needs to be done in a bottom-up fashion because the rq weight of a * parent group depends on the shares of its child groups. */ -static void -tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) +static int tg_shares_up(struct task_group *tg, void *data) { unsigned long rq_weight = 0; unsigned long shares = 0; + struct sched_domain *sd = data; int i; for_each_cpu_mask(i, sd->span) { @@ -1515,6 +1534,8 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) __update_group_shares_cpu(tg, i, shares, rq_weight); spin_unlock_irqrestore(&rq->lock, flags); } + + return 0; } /* @@ -1522,10 +1543,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) * This needs to be done in a top-down fashion because the load of a child * group is a fraction of its parents load. */ -static void -tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) +static int tg_load_down(struct task_group *tg, void *data) { unsigned long load; + long cpu = (long)data; if (!tg->parent) { load = cpu_rq(cpu)->load.weight; @@ -1536,11 +1557,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) } tg->cfs_rq[cpu]->h_load = load; -} -static void -tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd) -{ + return 0; } static void update_shares(struct sched_domain *sd) @@ -1550,7 +1568,7 @@ static void update_shares(struct sched_domain *sd) if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { sd->last_update = now; - walk_tg_tree(tg_nop, tg_shares_up, 0, sd); + walk_tg_tree(tg_nop, tg_shares_up, sd); } } @@ -1561,9 +1579,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) spin_lock(&rq->lock); } -static void update_h_load(int cpu) +static void update_h_load(long cpu) { - walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); + walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } #else @@ -1921,11 +1939,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) running = task_running(rq, p); on_rq = p->se.on_rq; ncsw = 0; - if (!match_state || p->state == match_state) { - ncsw = p->nivcsw + p->nvcsw; - if (unlikely(!ncsw)) - ncsw = 1; - } + if (!match_state || p->state == match_state) + ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ task_rq_unlock(rq, &flags); /* @@ -2285,7 +2300,7 @@ out_running: trace_mark(kernel_sched_wakeup, "pid %d state %ld ## rq %p task %p rq->curr %p", p->pid, p->state, rq, p, rq->curr); - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, sync); p->state = TASK_RUNNING; #ifdef CONFIG_SMP @@ -2420,7 +2435,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) trace_mark(kernel_sched_wakeup_new, "pid %d state %ld ## rq %p task %p rq->curr %p", p->pid, p->state, rq, p, rq->curr); - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) p->sched_class->task_wake_up(rq, p); @@ -2880,7 +2895,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, * Note that idle threads have a prio of MAX_PRIO, for this test * to be always true for them. */ - check_preempt_curr(this_rq, p); + check_preempt_curr(this_rq, p, 0); } /* @@ -4634,6 +4649,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ +/** + * complete: - signals a single thread waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up a single thread waiting on this completion. Threads will be + * awakened in the same order in which they were queued. + * + * See also complete_all(), wait_for_completion() and related routines. + */ void complete(struct completion *x) { unsigned long flags; @@ -4645,6 +4669,12 @@ void complete(struct completion *x) } EXPORT_SYMBOL(complete); +/** + * complete_all: - signals all threads waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up all threads waiting on this particular completion event. + */ void complete_all(struct completion *x) { unsigned long flags; @@ -4665,10 +4695,7 @@ do_wait_for_common(struct completion *x, long timeout, int state) wait.flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue_tail(&x->wait, &wait); do { - if ((state == TASK_INTERRUPTIBLE && - signal_pending(current)) || - (state == TASK_KILLABLE && - fatal_signal_pending(current))) { + if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; break; } @@ -4696,12 +4723,31 @@ wait_for_common(struct completion *x, long timeout, int state) return timeout; } +/** + * wait_for_completion: - waits for completion of a task + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It is NOT + * interruptible and there is no timeout. + * + * See also similar routines (i.e. wait_for_completion_timeout()) with timeout + * and interrupt capability. Also see complete(). + */ void __sched wait_for_completion(struct completion *x) { wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(wait_for_completion); +/** + * wait_for_completion_timeout: - waits for completion of a task (w/timeout) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. The timeout is in jiffies. It is not + * interruptible. + */ unsigned long __sched wait_for_completion_timeout(struct completion *x, unsigned long timeout) { @@ -4709,6 +4755,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout) } EXPORT_SYMBOL(wait_for_completion_timeout); +/** + * wait_for_completion_interruptible: - waits for completion of a task (w/intr) + * @x: holds the state of this particular completion + * + * This waits for completion of a specific task to be signaled. It is + * interruptible. + */ int __sched wait_for_completion_interruptible(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); @@ -4718,6 +4771,14 @@ int __sched wait_for_completion_interruptible(struct completion *x) } EXPORT_SYMBOL(wait_for_completion_interruptible); +/** + * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. It is interruptible. The timeout is in jiffies. + */ unsigned long __sched wait_for_completion_interruptible_timeout(struct completion *x, unsigned long timeout) @@ -4726,6 +4787,13 @@ wait_for_completion_interruptible_timeout(struct completion *x, } EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); +/** + * wait_for_completion_killable: - waits for completion of a task (killable) + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It can be + * interrupted by a kill signal. + */ int __sched wait_for_completion_killable(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); @@ -5128,7 +5196,8 @@ recheck: * Do not allow realtime tasks into groups that have no runtime * assigned. */ - if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) + if (rt_bandwidth_enabled() && rt_policy(policy) && + task_group(p)->rt_bandwidth.rt_runtime == 0) return -EPERM; #endif @@ -5964,7 +6033,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) set_task_cpu(p, dest_cpu); if (on_rq) { activate_task(rq_dest, p, 0); - check_preempt_curr(rq_dest, p); + check_preempt_curr(rq_dest, p, 0); } done: ret = 1; @@ -6289,7 +6358,7 @@ set_table_entry(struct ctl_table *entry, static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { - struct ctl_table *table = sd_alloc_ctl_entry(12); + struct ctl_table *table = sd_alloc_ctl_entry(13); if (table == NULL) return NULL; @@ -6317,7 +6386,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[10], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); - /* &table[11] is terminator */ + set_table_entry(&table[11], "name", sd->name, + CORENAME_MAX_SIZE, 0444, proc_dostring); + /* &table[12] is terminator */ return table; } @@ -7201,13 +7272,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) * Non-inlined to reduce accumulated stack pressure in build_sched_domains() */ +#ifdef CONFIG_SCHED_DEBUG +# define SD_INIT_NAME(sd, type) sd->name = #type +#else +# define SD_INIT_NAME(sd, type) do { } while (0) +#endif + #define SD_INIT(sd, type) sd_init_##type(sd) + #define SD_INIT_FUNC(type) \ static noinline void sd_init_##type(struct sched_domain *sd) \ { \ memset(sd, 0, sizeof(*sd)); \ *sd = SD_##type##_INIT; \ sd->level = SD_LV_##type; \ + SD_INIT_NAME(sd, type); \ } SD_INIT_FUNC(CPU) @@ -8249,20 +8328,25 @@ void __might_sleep(char *file, int line) #ifdef in_atomic static unsigned long prev_jiffy; /* ratelimiting */ - if ((in_atomic() || irqs_disabled()) && - system_state == SYSTEM_RUNNING && !oops_in_progress) { - if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) - return; - prev_jiffy = jiffies; - printk(KERN_ERR "BUG: sleeping function called from invalid" - " context at %s:%d\n", file, line); - printk("in_atomic():%d, irqs_disabled():%d\n", - in_atomic(), irqs_disabled()); - debug_show_held_locks(current); - if (irqs_disabled()) - print_irqtrace_events(current); - dump_stack(); - } + if ((!in_atomic() && !irqs_disabled()) || + system_state != SYSTEM_RUNNING || oops_in_progress) + return; + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) + return; + prev_jiffy = jiffies; + + printk(KERN_ERR + "BUG: sleeping function called from invalid context at %s:%d\n", + file, line); + printk(KERN_ERR + "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), + current->pid, current->comm); + + debug_show_held_locks(current); + if (irqs_disabled()) + print_irqtrace_events(current); + dump_stack(); #endif } EXPORT_SYMBOL(__might_sleep); @@ -8760,73 +8844,95 @@ static DEFINE_MUTEX(rt_constraints_mutex); static unsigned long to_ratio(u64 period, u64 runtime) { if (runtime == RUNTIME_INF) - return 1ULL << 16; + return 1ULL << 20; - return div64_u64(runtime << 16, period); + return div64_u64(runtime << 20, period); } -#ifdef CONFIG_CGROUP_SCHED -static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) +/* Must be called with tasklist_lock held */ +static inline int tg_has_rt_tasks(struct task_group *tg) { - struct task_group *tgi, *parent = tg->parent; - unsigned long total = 0; + struct task_struct *g, *p; - if (!parent) { - if (global_rt_period() < period) - return 0; + do_each_thread(g, p) { + if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) + return 1; + } while_each_thread(g, p); - return to_ratio(period, runtime) < - to_ratio(global_rt_period(), global_rt_runtime()); - } + return 0; +} - if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) - return 0; +struct rt_schedulable_data { + struct task_group *tg; + u64 rt_period; + u64 rt_runtime; +}; - rcu_read_lock(); - list_for_each_entry_rcu(tgi, &parent->children, siblings) { - if (tgi == tg) - continue; +static int tg_schedulable(struct task_group *tg, void *data) +{ + struct rt_schedulable_data *d = data; + struct task_group *child; + unsigned long total, sum = 0; + u64 period, runtime; + + period = ktime_to_ns(tg->rt_bandwidth.rt_period); + runtime = tg->rt_bandwidth.rt_runtime; - total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), - tgi->rt_bandwidth.rt_runtime); + if (tg == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; } - rcu_read_unlock(); - return total + to_ratio(period, runtime) <= - to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), - parent->rt_bandwidth.rt_runtime); -} -#elif defined CONFIG_USER_SCHED -static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) -{ - struct task_group *tgi; - unsigned long total = 0; - unsigned long global_ratio = - to_ratio(global_rt_period(), global_rt_runtime()); + /* + * Cannot have more runtime than the period. + */ + if (runtime > period && runtime != RUNTIME_INF) + return -EINVAL; - rcu_read_lock(); - list_for_each_entry_rcu(tgi, &task_groups, list) { - if (tgi == tg) - continue; + /* + * Ensure we don't starve existing RT tasks. + */ + if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) + return -EBUSY; + + total = to_ratio(period, runtime); - total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), - tgi->rt_bandwidth.rt_runtime); + /* + * Nobody can have more than the global setting allows. + */ + if (total > to_ratio(global_rt_period(), global_rt_runtime())) + return -EINVAL; + + /* + * The sum of our children's runtime should not exceed our own. + */ + list_for_each_entry_rcu(child, &tg->children, siblings) { + period = ktime_to_ns(child->rt_bandwidth.rt_period); + runtime = child->rt_bandwidth.rt_runtime; + + if (child == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; + } + + sum += to_ratio(period, runtime); } - rcu_read_unlock(); - return total + to_ratio(period, runtime) < global_ratio; + if (sum > total) + return -EINVAL; + + return 0; } -#endif -/* Must be called with tasklist_lock held */ -static inline int tg_has_rt_tasks(struct task_group *tg) +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) { - struct task_struct *g, *p; - do_each_thread(g, p) { - if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) - return 1; - } while_each_thread(g, p); - return 0; + struct rt_schedulable_data data = { + .tg = tg, + .rt_period = period, + .rt_runtime = runtime, + }; + + return walk_tg_tree(tg_schedulable, tg_nop, &data); } static int tg_set_bandwidth(struct task_group *tg, @@ -8836,14 +8942,9 @@ static int tg_set_bandwidth(struct task_group *tg, mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); - if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { - err = -EBUSY; - goto unlock; - } - if (!__rt_schedulable(tg, rt_period, rt_runtime)) { - err = -EINVAL; + err = __rt_schedulable(tg, rt_period, rt_runtime); + if (err) goto unlock; - } spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); @@ -8912,16 +9013,25 @@ long sched_group_rt_period(struct task_group *tg) static int sched_rt_global_constraints(void) { - struct task_group *tg = &root_task_group; - u64 rt_runtime, rt_period; + u64 runtime, period; int ret = 0; - rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); - rt_runtime = tg->rt_bandwidth.rt_runtime; + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + + runtime = global_rt_runtime(); + period = global_rt_period(); + + /* + * Sanity check on the sysctl variables. + */ + if (runtime > period && runtime != RUNTIME_INF) + return -EINVAL; mutex_lock(&rt_constraints_mutex); - if (!__rt_schedulable(tg, rt_period, rt_runtime)) - ret = -EINVAL; + read_lock(&tasklist_lock); + ret = __rt_schedulable(NULL, 0, 0); + read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); return ret; @@ -8932,6 +9042,9 @@ static int sched_rt_global_constraints(void) unsigned long flags; int i; + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { struct rt_rq *rt_rq = &cpu_rq(i)->rt; @@ -8992,7 +9105,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) if (!cgrp->parent) { /* This is early initialization for the top cgroup */ - init_task_group.css.cgroup = cgrp; return &init_task_group.css; } @@ -9001,9 +9113,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); - /* Bind the cgroup to task_group object we just created */ - tg->css.cgroup = cgrp; - return &tg->css; } diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index e8ab096ddfe..81787248b60 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -118,13 +118,13 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) /* * scd->clock = clamp(scd->tick_gtod + delta, - * max(scd->tick_gtod, scd->clock), - * scd->tick_gtod + TICK_NSEC); + * max(scd->tick_gtod, scd->clock), + * max(scd->clock, scd->tick_gtod + TICK_NSEC)); */ clock = scd->tick_gtod + delta; min_clock = wrap_max(scd->tick_gtod, scd->clock); - max_clock = scd->tick_gtod + TICK_NSEC; + max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC); clock = wrap_max(clock, min_clock); clock = wrap_min(clock, max_clock); diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index bbe6b31c3c5..ad958c1ec70 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -333,12 +333,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) unsigned long flags; int num_threads = 1; - rcu_read_lock(); if (lock_task_sighand(p, &flags)) { num_threads = atomic_read(&p->signal->count); unlock_task_sighand(p, &flags); } - rcu_read_unlock(); SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); SEQ_printf(m, diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 99aa31acc54..f604dae7131 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -409,64 +409,6 @@ static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) } /* - * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in - * that it favours >=0 over <0. - * - * -20 | - * | - * 0 --------+------- - * .' - * 19 .' - * - */ -static unsigned long -calc_delta_asym(unsigned long delta, struct sched_entity *se) -{ - struct load_weight lw = { - .weight = NICE_0_LOAD, - .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT) - }; - - for_each_sched_entity(se) { - struct load_weight *se_lw = &se->load; - unsigned long rw = cfs_rq_of(se)->load.weight; - -#ifdef CONFIG_FAIR_SCHED_GROUP - struct cfs_rq *cfs_rq = se->my_q; - struct task_group *tg = NULL - - if (cfs_rq) - tg = cfs_rq->tg; - - if (tg && tg->shares < NICE_0_LOAD) { - /* - * scale shares to what it would have been had - * tg->weight been NICE_0_LOAD: - * - * weight = 1024 * shares / tg->weight - */ - lw.weight *= se->load.weight; - lw.weight /= tg->shares; - - lw.inv_weight = 0; - - se_lw = &lw; - rw += lw.weight - se->load.weight; - } else -#endif - - if (se->load.weight < NICE_0_LOAD) { - se_lw = &lw; - rw += NICE_0_LOAD - se->load.weight; - } - - delta = calc_delta_mine(delta, rw, se_lw); - } - - return delta; -} - -/* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. */ @@ -587,11 +529,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_add(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) inc_cpu_load(rq_of(cfs_rq), se->load.weight); - if (entity_is_task(se)) + if (entity_is_task(se)) { add_cfs_task_weight(cfs_rq, se->load.weight); + list_add(&se->group_node, &cfs_rq->tasks); + } cfs_rq->nr_running++; se->on_rq = 1; - list_add(&se->group_node, &cfs_rq->tasks); } static void @@ -600,11 +543,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_sub(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) dec_cpu_load(rq_of(cfs_rq), se->load.weight); - if (entity_is_task(se)) + if (entity_is_task(se)) { add_cfs_task_weight(cfs_rq, -se->load.weight); + list_del_init(&se->group_node); + } cfs_rq->nr_running--; se->on_rq = 0; - list_del_init(&se->group_node); } static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -1086,7 +1030,6 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) { struct sched_entity *se = tg->se[cpu]; - long more_w; if (!tg->parent) return wl; @@ -1098,18 +1041,17 @@ static long effective_load(struct task_group *tg, int cpu, if (!wl && sched_feat(ASYM_EFF_LOAD)) return wl; - /* - * Instead of using this increment, also add the difference - * between when the shares were last updated and now. - */ - more_w = se->my_q->load.weight - se->my_q->rq_weight; - wl += more_w; - wg += more_w; - for_each_sched_entity(se) { -#define D(n) (likely(n) ? (n) : 1) - long S, rw, s, a, b; + long more_w; + + /* + * Instead of using this increment, also add the difference + * between when the shares were last updated and now. + */ + more_w = se->my_q->load.weight - se->my_q->rq_weight; + wl += more_w; + wg += more_w; S = se->my_q->tg->shares; s = se->my_q->shares; @@ -1118,7 +1060,11 @@ static long effective_load(struct task_group *tg, int cpu, a = S*(rw + wl); b = S*rw + s*wg; - wl = s*(a-b)/D(b); + wl = s*(a-b); + + if (likely(b)) + wl /= b; + /* * Assume the group is already running and will * thus already be accounted for in the weight. @@ -1127,7 +1073,6 @@ static long effective_load(struct task_group *tg, int cpu, * alter the group weight. */ wg = 0; -#undef D } return wl; @@ -1144,7 +1089,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, #endif static int -wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, +wake_affine(struct sched_domain *this_sd, struct rq *this_rq, struct task_struct *p, int prev_cpu, int this_cpu, int sync, int idx, unsigned long load, unsigned long this_load, unsigned int imbalance) @@ -1159,6 +1104,11 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) return 0; + if (!sync && sched_feat(SYNC_WAKEUPS) && + curr->se.avg_overlap < sysctl_sched_migration_cost && + p->se.avg_overlap < sysctl_sched_migration_cost) + sync = 1; + /* * If sync wakeup then subtract the (maximum possible) * effect of the currently running task from the load @@ -1183,17 +1133,14 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, * a reasonable amount of time then attract this newly * woken task: */ - if (sync && balanced) { - if (curr->se.avg_overlap < sysctl_sched_migration_cost && - p->se.avg_overlap < sysctl_sched_migration_cost) - return 1; - } + if (sync && balanced) + return 1; schedstat_inc(p, se.nr_wakeups_affine_attempts); tl_per_task = cpu_avg_load_per_task(this_cpu); - if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || - balanced) { + if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <= + tl_per_task)) { /* * This domain has SD_WAKE_AFFINE and * p is cache cold in this domain, and @@ -1212,16 +1159,17 @@ static int select_task_rq_fair(struct task_struct *p, int sync) struct sched_domain *sd, *this_sd = NULL; int prev_cpu, this_cpu, new_cpu; unsigned long load, this_load; - struct rq *rq, *this_rq; + struct rq *this_rq; unsigned int imbalance; int idx; prev_cpu = task_cpu(p); - rq = task_rq(p); this_cpu = smp_processor_id(); this_rq = cpu_rq(this_cpu); new_cpu = prev_cpu; + if (prev_cpu == this_cpu) + goto out; /* * 'this_sd' is the first domain that both * this_cpu and prev_cpu are present in: @@ -1249,13 +1197,10 @@ static int select_task_rq_fair(struct task_struct *p, int sync) load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); - if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, + if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, load, this_load, imbalance)) return this_cpu; - if (prev_cpu == this_cpu) - goto out; - /* * Start passive balancing when half the imbalance_pct * limit is reached. @@ -1282,62 +1227,20 @@ static unsigned long wakeup_gran(struct sched_entity *se) * + nice tasks. */ if (sched_feat(ASYM_GRAN)) - gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); - else - gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se); + gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load); return gran; } /* - * Should 'se' preempt 'curr'. - * - * |s1 - * |s2 - * |s3 - * g - * |<--->|c - * - * w(c, s1) = -1 - * w(c, s2) = 0 - * w(c, s3) = 1 - * - */ -static int -wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) -{ - s64 gran, vdiff = curr->vruntime - se->vruntime; - - if (vdiff < 0) - return -1; - - gran = wakeup_gran(curr); - if (vdiff > gran) - return 1; - - return 0; -} - -/* return depth at which a sched entity is present in the hierarchy */ -static inline int depth_se(struct sched_entity *se) -{ - int depth = 0; - - for_each_sched_entity(se) - depth++; - - return depth; -} - -/* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) +static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) { struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se, *pse = &p->se; - int se_depth, pse_depth; + s64 delta_exec; if (unlikely(rt_prio(p->prio))) { update_rq_clock(rq); @@ -1352,6 +1255,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) cfs_rq_of(pse)->next = pse; /* + * We can come here with TIF_NEED_RESCHED already set from new task + * wake up path. + */ + if (test_tsk_need_resched(curr)) + return; + + /* * Batch tasks do not preempt (their preemption is driven by * the tick): */ @@ -1361,33 +1271,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) if (!sched_feat(WAKEUP_PREEMPT)) return; - /* - * preemption test can be made between sibling entities who are in the - * same cfs_rq i.e who have a common parent. Walk up the hierarchy of - * both tasks until we find their ancestors who are siblings of common - * parent. - */ - - /* First walk up until both entities are at same depth */ - se_depth = depth_se(se); - pse_depth = depth_se(pse); - - while (se_depth > pse_depth) { - se_depth--; - se = parent_entity(se); - } - - while (pse_depth > se_depth) { - pse_depth--; - pse = parent_entity(pse); - } - - while (!is_same_group(se, pse)) { - se = parent_entity(se); - pse = parent_entity(pse); + if (sched_feat(WAKEUP_OVERLAP) && (sync || + (se->avg_overlap < sysctl_sched_migration_cost && + pse->avg_overlap < sysctl_sched_migration_cost))) { + resched_task(curr); + return; } - if (wakeup_preempt_entity(se, pse) == 1) + delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime; + if (delta_exec > wakeup_gran(pse)) resched_task(curr); } @@ -1446,19 +1338,9 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) if (next == &cfs_rq->tasks) return NULL; - /* Skip over entities that are not tasks */ - do { - se = list_entry(next, struct sched_entity, group_node); - next = next->next; - } while (next != &cfs_rq->tasks && !entity_is_task(se)); - - if (next == &cfs_rq->tasks) - return NULL; - - cfs_rq->balance_iterator = next; - - if (entity_is_task(se)) - p = task_of(se); + se = list_entry(next, struct sched_entity, group_node); + p = task_of(se); + cfs_rq->balance_iterator = next->next; return p; } @@ -1508,7 +1390,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, rcu_read_lock(); update_h_load(busiest_cpu); - list_for_each_entry(tg, &task_groups, list) { + list_for_each_entry_rcu(tg, &task_groups, list) { struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; unsigned long busiest_h_load = busiest_cfs_rq->h_load; unsigned long busiest_weight = busiest_cfs_rq->load.weight; @@ -1621,10 +1503,10 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) * 'current' within the tree based on its new key value. */ swap(curr->vruntime, se->vruntime); + resched_task(rq->curr); } enqueue_task_fair(rq, p, 0); - resched_task(rq->curr); } /* @@ -1643,7 +1525,7 @@ static void prio_changed_fair(struct rq *rq, struct task_struct *p, if (p->prio > oldprio) resched_task(rq->curr); } else - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); } /* @@ -1660,7 +1542,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p, if (running) resched_task(rq->curr); else - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); } /* Account for a task changing its policy or group. diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 9353ca78154..7c9e8f4a049 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -11,3 +11,4 @@ SCHED_FEAT(ASYM_GRAN, 1) SCHED_FEAT(LB_BIAS, 1) SCHED_FEAT(LB_WAKEUP_UPDATE, 1) SCHED_FEAT(ASYM_EFF_LOAD, 1) +SCHED_FEAT(WAKEUP_OVERLAP, 0) diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 3a4f92dbbe6..dec4ccabe2f 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync) /* * Idle tasks are unconditionally rescheduled: */ -static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p) +static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync) { resched_task(rq->idle); } @@ -76,7 +76,7 @@ static void switched_to_idle(struct rq *rq, struct task_struct *p, if (running) resched_task(rq->curr); else - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); } static void prio_changed_idle(struct rq *rq, struct task_struct *p, @@ -93,7 +93,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p, if (p->prio > oldprio) resched_task(rq->curr); } else - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); } /* diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 8375e69af36..b446dc87494 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -102,12 +102,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se); static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { + struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; struct sched_rt_entity *rt_se = rt_rq->rt_se; - if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) { - struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; - - enqueue_rt_entity(rt_se); + if (rt_rq->rt_nr_running) { + if (rt_se && !on_rt_rq(rt_se)) + enqueue_rt_entity(rt_se); if (rt_rq->highest_prio < curr->prio) resched_task(curr); } @@ -231,6 +231,9 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_SMP +/* + * We ran out of runtime, see if we can borrow some from our neighbours. + */ static int do_balance_runtime(struct rt_rq *rt_rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); @@ -250,9 +253,18 @@ static int do_balance_runtime(struct rt_rq *rt_rq) continue; spin_lock(&iter->rt_runtime_lock); + /* + * Either all rqs have inf runtime and there's nothing to steal + * or __disable_runtime() below sets a specific rq to inf to + * indicate its been disabled and disalow stealing. + */ if (iter->rt_runtime == RUNTIME_INF) goto next; + /* + * From runqueues with spare time, take 1/n part of their + * spare time, but no more than our period. + */ diff = iter->rt_runtime - iter->rt_time; if (diff > 0) { diff = div_u64((u64)diff, weight); @@ -274,6 +286,9 @@ next: return more; } +/* + * Ensure this RQ takes back all the runtime it lend to its neighbours. + */ static void __disable_runtime(struct rq *rq) { struct root_domain *rd = rq->rd; @@ -289,17 +304,33 @@ static void __disable_runtime(struct rq *rq) spin_lock(&rt_b->rt_runtime_lock); spin_lock(&rt_rq->rt_runtime_lock); + /* + * Either we're all inf and nobody needs to borrow, or we're + * already disabled and thus have nothing to do, or we have + * exactly the right amount of runtime to take out. + */ if (rt_rq->rt_runtime == RUNTIME_INF || rt_rq->rt_runtime == rt_b->rt_runtime) goto balanced; spin_unlock(&rt_rq->rt_runtime_lock); + /* + * Calculate the difference between what we started out with + * and what we current have, that's the amount of runtime + * we lend and now have to reclaim. + */ want = rt_b->rt_runtime - rt_rq->rt_runtime; + /* + * Greedy reclaim, take back as much as we can. + */ for_each_cpu_mask(i, rd->span) { struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); s64 diff; + /* + * Can't reclaim from ourselves or disabled runqueues. + */ if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) continue; @@ -319,8 +350,16 @@ static void __disable_runtime(struct rq *rq) } spin_lock(&rt_rq->rt_runtime_lock); + /* + * We cannot be left wanting - that would mean some runtime + * leaked out of the system. + */ BUG_ON(want); balanced: + /* + * Disable all the borrow logic by pretending we have inf + * runtime - in which case borrowing doesn't make sense. + */ rt_rq->rt_runtime = RUNTIME_INF; spin_unlock(&rt_rq->rt_runtime_lock); spin_unlock(&rt_b->rt_runtime_lock); @@ -343,6 +382,9 @@ static void __enable_runtime(struct rq *rq) if (unlikely(!scheduler_running)) return; + /* + * Reset each runqueue's bandwidth settings + */ for_each_leaf_rt_rq(rt_rq, rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); @@ -350,6 +392,7 @@ static void __enable_runtime(struct rq *rq) spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_runtime = rt_b->rt_runtime; rt_rq->rt_time = 0; + rt_rq->rt_throttled = 0; spin_unlock(&rt_rq->rt_runtime_lock); spin_unlock(&rt_b->rt_runtime_lock); } @@ -388,7 +431,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) int i, idle = 1; cpumask_t span; - if (rt_b->rt_runtime == RUNTIME_INF) + if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) return 1; span = sched_rt_period_mask(); @@ -488,6 +531,9 @@ static void update_curr_rt(struct rq *rq) curr->se.exec_start = rq->clock; cpuacct_charge(curr, delta_exec); + if (!rt_bandwidth_enabled()) + return; + for_each_sched_rt_entity(rt_se) { rt_rq = rt_rq_of_se(rt_se); @@ -785,7 +831,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) +static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync) { if (p->prio < rq->curr->prio) { resched_task(rq->curr); diff --git a/kernel/softirq.c b/kernel/softirq.c index c506f266a6b..7110daeb9a9 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -6,6 +6,8 @@ * Distribute under GPLv2. * * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) + * + * Remote softirq infrastructure is by Jens Axboe. */ #include <linux/module.h> @@ -46,7 +48,7 @@ irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned; EXPORT_SYMBOL(irq_stat); #endif -static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp; +static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); @@ -205,7 +207,18 @@ restart: do { if (pending & 1) { + int prev_count = preempt_count(); + h->action(h); + + if (unlikely(prev_count != preempt_count())) { + printk(KERN_ERR "huh, entered softirq %td %p" + "with preempt_count %08x," + " exited with %08x?\n", h - softirq_vec, + h->action, prev_count, preempt_count()); + preempt_count() = prev_count; + } + rcu_bh_qsctr_inc(cpu); } h++; @@ -254,16 +267,12 @@ asmlinkage void do_softirq(void) */ void irq_enter(void) { -#ifdef CONFIG_NO_HZ int cpu = smp_processor_id(); + if (idle_cpu(cpu) && !in_interrupt()) - tick_nohz_stop_idle(cpu); -#endif + tick_check_idle(cpu); + __irq_enter(); -#ifdef CONFIG_NO_HZ - if (idle_cpu(cpu)) - tick_nohz_update_jiffies(); -#endif } #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED @@ -463,17 +472,144 @@ void tasklet_kill(struct tasklet_struct *t) EXPORT_SYMBOL(tasklet_kill); +DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); +EXPORT_PER_CPU_SYMBOL(softirq_work_list); + +static void __local_trigger(struct call_single_data *cp, int softirq) +{ + struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]); + + list_add_tail(&cp->list, head); + + /* Trigger the softirq only if the list was previously empty. */ + if (head->next == &cp->list) + raise_softirq_irqoff(softirq); +} + +#ifdef CONFIG_USE_GENERIC_SMP_HELPERS +static void remote_softirq_receive(void *data) +{ + struct call_single_data *cp = data; + unsigned long flags; + int softirq; + + softirq = cp->priv; + + local_irq_save(flags); + __local_trigger(cp, softirq); + local_irq_restore(flags); +} + +static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +{ + if (cpu_online(cpu)) { + cp->func = remote_softirq_receive; + cp->info = cp; + cp->flags = 0; + cp->priv = softirq; + + __smp_call_function_single(cpu, cp); + return 0; + } + return 1; +} +#else /* CONFIG_USE_GENERIC_SMP_HELPERS */ +static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +{ + return 1; +} +#endif + +/** + * __send_remote_softirq - try to schedule softirq work on a remote cpu + * @cp: private SMP call function data area + * @cpu: the remote cpu + * @this_cpu: the currently executing cpu + * @softirq: the softirq for the work + * + * Attempt to schedule softirq work on a remote cpu. If this cannot be + * done, the work is instead queued up on the local cpu. + * + * Interrupts must be disabled. + */ +void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq) +{ + if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq)) + __local_trigger(cp, softirq); +} +EXPORT_SYMBOL(__send_remote_softirq); + +/** + * send_remote_softirq - try to schedule softirq work on a remote cpu + * @cp: private SMP call function data area + * @cpu: the remote cpu + * @softirq: the softirq for the work + * + * Like __send_remote_softirq except that disabling interrupts and + * computing the current cpu is done for the caller. + */ +void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +{ + unsigned long flags; + int this_cpu; + + local_irq_save(flags); + this_cpu = smp_processor_id(); + __send_remote_softirq(cp, cpu, this_cpu, softirq); + local_irq_restore(flags); +} +EXPORT_SYMBOL(send_remote_softirq); + +static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + /* + * If a CPU goes away, splice its entries to the current CPU + * and trigger a run of the softirq + */ + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { + int cpu = (unsigned long) hcpu; + int i; + + local_irq_disable(); + for (i = 0; i < NR_SOFTIRQS; i++) { + struct list_head *head = &per_cpu(softirq_work_list[i], cpu); + struct list_head *local_head; + + if (list_empty(head)) + continue; + + local_head = &__get_cpu_var(softirq_work_list[i]); + list_splice_init(head, local_head); + raise_softirq_irqoff(i); + } + local_irq_enable(); + } + + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = { + .notifier_call = remote_softirq_cpu_notify, +}; + void __init softirq_init(void) { int cpu; for_each_possible_cpu(cpu) { + int i; + per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; + for (i = 0; i < NR_SOFTIRQS; i++) + INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu)); } + register_hotcpu_notifier(&remote_softirq_cpu_notifier); + open_softirq(TASKLET_SOFTIRQ, tasklet_action); open_softirq(HI_SOFTIRQ, tasklet_hi_action); } diff --git a/kernel/softlockup.c b/kernel/softlockup.c index cb838ee93a8..3953e4aed73 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -226,7 +226,7 @@ static void check_hung_uninterruptible_tasks(int this_cpu) * If the system crashed already then all bets are off, * do not report extra hung tasks: */ - if ((tainted & TAINT_DIE) || did_panic) + if (test_taint(TAINT_DIE) || did_panic) return; read_lock(&tasklist_lock); diff --git a/kernel/sys.c b/kernel/sys.c index d046a7a055c..53879cdae48 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1050,9 +1050,7 @@ asmlinkage long sys_setsid(void) group_leader->signal->leader = 1; __set_special_pids(sid); - spin_lock(&group_leader->sighand->siglock); - group_leader->signal->tty = NULL; - spin_unlock(&group_leader->sighand->siglock); + proc_clear_tty(group_leader); err = session; out: @@ -1341,8 +1339,10 @@ asmlinkage long sys_sethostname(char __user *name, int len) down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { - memcpy(utsname()->nodename, tmp, len); - utsname()->nodename[len] = 0; + struct new_utsname *u = utsname(); + + memcpy(u->nodename, tmp, len); + memset(u->nodename + len, 0, sizeof(u->nodename) - len); errno = 0; } up_write(&uts_sem); @@ -1354,15 +1354,17 @@ asmlinkage long sys_sethostname(char __user *name, int len) asmlinkage long sys_gethostname(char __user *name, int len) { int i, errno; + struct new_utsname *u; if (len < 0) return -EINVAL; down_read(&uts_sem); - i = 1 + strlen(utsname()->nodename); + u = utsname(); + i = 1 + strlen(u->nodename); if (i > len) i = len; errno = 0; - if (copy_to_user(name, utsname()->nodename, i)) + if (copy_to_user(name, u->nodename, i)) errno = -EFAULT; up_read(&uts_sem); return errno; @@ -1387,8 +1389,10 @@ asmlinkage long sys_setdomainname(char __user *name, int len) down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { - memcpy(utsname()->domainname, tmp, len); - utsname()->domainname[len] = 0; + struct new_utsname *u = utsname(); + + memcpy(u->domainname, tmp, len); + memset(u->domainname + len, 0, sizeof(u->domainname) - len); errno = 0; } up_write(&uts_sem); @@ -1441,14 +1445,22 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) return -EINVAL; if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) return -EFAULT; - if (new_rlim.rlim_cur > new_rlim.rlim_max) - return -EINVAL; old_rlim = current->signal->rlim + resource; if ((new_rlim.rlim_max > old_rlim->rlim_max) && !capable(CAP_SYS_RESOURCE)) return -EPERM; - if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open) - return -EPERM; + + if (resource == RLIMIT_NOFILE) { + if (new_rlim.rlim_max == RLIM_INFINITY) + new_rlim.rlim_max = sysctl_nr_open; + if (new_rlim.rlim_cur == RLIM_INFINITY) + new_rlim.rlim_cur = sysctl_nr_open; + if (new_rlim.rlim_max > sysctl_nr_open) + return -EPERM; + } + + if (new_rlim.rlim_cur > new_rlim.rlim_max) + return -EINVAL; retval = security_task_setrlimit(resource, &new_rlim); if (retval) diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 08d6e1bb99a..a77b27b11b0 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -125,6 +125,12 @@ cond_syscall(sys_vm86old); cond_syscall(sys_vm86); cond_syscall(compat_sys_ipc); cond_syscall(compat_sys_sysctl); +cond_syscall(sys_flock); +cond_syscall(sys_io_setup); +cond_syscall(sys_io_destroy); +cond_syscall(sys_io_submit); +cond_syscall(sys_io_cancel); +cond_syscall(sys_io_getevents); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 50ec0886fa3..617d41e4d6a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -80,7 +80,6 @@ extern int pid_max_min, pid_max_max; extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; extern int compat_log; -extern int maps_protect; extern int latencytop_enabled; extern int sysctl_nr_open_min, sysctl_nr_open_max; #ifdef CONFIG_RCU_TORTURE_TEST @@ -97,7 +96,7 @@ static int sixty = 60; static int neg_one = -1; #endif -#ifdef CONFIG_MMU +#if defined(CONFIG_MMU) && defined(CONFIG_FILE_LOCKING) static int two = 2; #endif @@ -118,10 +117,8 @@ extern char modprobe_path[]; extern int sg_big_buff; #endif -#ifdef __sparc__ -extern char reboot_command []; -extern int stop_a_enabled; -extern int scons_pwroff; +#ifdef CONFIG_SPARC +#include <asm/system.h> #endif #ifdef __hppa__ @@ -152,7 +149,7 @@ extern int max_lock_depth; #ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); -static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp, +static int proc_taint(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); #endif @@ -382,10 +379,9 @@ static struct ctl_table kern_table[] = { #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted", - .data = &tainted, - .maxlen = sizeof(int), + .maxlen = sizeof(long), .mode = 0644, - .proc_handler = &proc_dointvec_taint, + .proc_handler = &proc_taint, }, #endif #ifdef CONFIG_LATENCYTOP @@ -415,7 +411,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, -#ifdef __sparc__ +#ifdef CONFIG_SPARC { .ctl_name = KERN_SPARC_REBOOT, .procname = "reboot-cmd", @@ -810,16 +806,6 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif -#ifdef CONFIG_PROC_FS - { - .ctl_name = CTL_UNNUMBERED, - .procname = "maps_protect", - .data = &maps_protect, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif { .ctl_name = CTL_UNNUMBERED, .procname = "poweroff_cmd", @@ -1261,6 +1247,7 @@ static struct ctl_table fs_table[] = { .extra1 = &minolduid, .extra2 = &maxolduid, }, +#ifdef CONFIG_FILE_LOCKING { .ctl_name = FS_LEASES, .procname = "leases-enable", @@ -1269,6 +1256,7 @@ static struct ctl_table fs_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#endif #ifdef CONFIG_DNOTIFY { .ctl_name = FS_DIR_NOTIFY, @@ -1280,6 +1268,7 @@ static struct ctl_table fs_table[] = { }, #endif #ifdef CONFIG_MMU +#ifdef CONFIG_FILE_LOCKING { .ctl_name = FS_LEASE_TIME, .procname = "lease-break-time", @@ -1291,6 +1280,8 @@ static struct ctl_table fs_table[] = { .extra1 = &zero, .extra2 = &two, }, +#endif +#ifdef CONFIG_AIO { .procname = "aio-nr", .data = &aio_nr, @@ -1305,6 +1296,7 @@ static struct ctl_table fs_table[] = { .mode = 0644, .proc_handler = &proc_doulongvec_minmax, }, +#endif /* CONFIG_AIO */ #ifdef CONFIG_INOTIFY_USER { .ctl_name = FS_INOTIFY, @@ -1510,7 +1502,6 @@ void register_sysctl_root(struct ctl_table_root *root) /* Perform the actual read/write of a sysctl table entry. */ static int do_sysctl_strategy(struct ctl_table_root *root, struct ctl_table *table, - int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -1524,8 +1515,7 @@ static int do_sysctl_strategy(struct ctl_table_root *root, return -EPERM; if (table->strategy) { - rc = table->strategy(table, name, nlen, oldval, oldlenp, - newval, newlen); + rc = table->strategy(table, oldval, oldlenp, newval, newlen); if (rc < 0) return rc; if (rc > 0) @@ -1535,8 +1525,7 @@ static int do_sysctl_strategy(struct ctl_table_root *root, /* If there is no strategy routine, or if the strategy returns * zero, proceed with automatic r/w */ if (table->data && table->maxlen) { - rc = sysctl_data(table, name, nlen, oldval, oldlenp, - newval, newlen); + rc = sysctl_data(table, oldval, oldlenp, newval, newlen); if (rc < 0) return rc; } @@ -1568,7 +1557,7 @@ repeat: table = table->child; goto repeat; } - error = do_sysctl_strategy(root, table, name, nlen, + error = do_sysctl_strategy(root, table, oldval, oldlenp, newval, newlen); return error; @@ -2237,49 +2226,39 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp, NULL,NULL); } -#define OP_SET 0 -#define OP_AND 1 -#define OP_OR 2 - -static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - int op = *(int *)data; - if (write) { - int val = *negp ? -*lvalp : *lvalp; - switch(op) { - case OP_SET: *valp = val; break; - case OP_AND: *valp &= val; break; - case OP_OR: *valp |= val; break; - } - } else { - int val = *valp; - if (val < 0) { - *negp = -1; - *lvalp = (unsigned long)-val; - } else { - *negp = 0; - *lvalp = (unsigned long)val; - } - } - return 0; -} - /* - * Taint values can only be increased + * Taint values can only be increased + * This means we can safely use a temporary. */ -static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp, +static int proc_taint(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { - int op; + struct ctl_table t; + unsigned long tmptaint = get_taint(); + int err; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; - op = OP_OR; - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, - do_proc_dointvec_bset_conv,&op); + t = *table; + t.data = &tmptaint; + err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos); + if (err < 0) + return err; + + if (write) { + /* + * Poor man's atomic or. Not worth adding a primitive + * to everyone's atomic.h for this + */ + int i; + for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { + if ((tmptaint >> i) & 1) + add_taint(i); + } + } + + return err; } struct do_proc_dointvec_minmax_conv_param { @@ -2727,7 +2706,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, */ /* The generic sysctl data routine (used if no strategy routine supplied) */ -int sysctl_data(struct ctl_table *table, int __user *name, int nlen, +int sysctl_data(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -2761,7 +2740,7 @@ int sysctl_data(struct ctl_table *table, int __user *name, int nlen, } /* The generic string strategy routine: */ -int sysctl_string(struct ctl_table *table, int __user *name, int nlen, +int sysctl_string(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -2807,7 +2786,7 @@ int sysctl_string(struct ctl_table *table, int __user *name, int nlen, * are between the minimum and maximum values given in the arrays * table->extra1 and table->extra2, respectively. */ -int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen, +int sysctl_intvec(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -2843,7 +2822,7 @@ int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen, } /* Strategy function to convert jiffies to seconds */ -int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen, +int sysctl_jiffies(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -2877,7 +2856,7 @@ int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen, } /* Strategy function to convert jiffies to seconds */ -int sysctl_ms_jiffies(struct ctl_table *table, int __user *name, int nlen, +int sysctl_ms_jiffies(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -2932,35 +2911,35 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args) return error; } -int sysctl_data(struct ctl_table *table, int __user *name, int nlen, +int sysctl_data(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { return -ENOSYS; } -int sysctl_string(struct ctl_table *table, int __user *name, int nlen, +int sysctl_string(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { return -ENOSYS; } -int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen, +int sysctl_intvec(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { return -ENOSYS; } -int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen, +int sysctl_jiffies(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { return -ENOSYS; } -int sysctl_ms_jiffies(struct ctl_table *table, int __user *name, int nlen, +int sysctl_ms_jiffies(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 8d53106a0a9..95ed42951e0 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -3,7 +3,6 @@ # config TICK_ONESHOT bool - default n config NO_HZ bool "Tickless System (Dynamic Ticks)" diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 1876b526c77..f8d968063ce 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -72,6 +72,16 @@ void clockevents_set_mode(struct clock_event_device *dev, } /** + * clockevents_shutdown - shutdown the device and clear next_event + * @dev: device to shutdown + */ +void clockevents_shutdown(struct clock_event_device *dev) +{ + clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); + dev->next_event.tv64 = KTIME_MAX; +} + +/** * clockevents_program_event - Reprogram the clock event device. * @expires: absolute expiry time (monotonic clock) * @@ -206,7 +216,7 @@ void clockevents_exchange_device(struct clock_event_device *old, if (new) { BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED); - clockevents_set_mode(new, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(new); } local_irq_restore(flags); } diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 093d4acf993..9ed2eec9752 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -325,6 +325,9 @@ int clocksource_register(struct clocksource *c) unsigned long flags; int ret; + /* save mult_orig on registration */ + c->mult_orig = c->mult; + spin_lock_irqsave(&clocksource_lock, flags); ret = clocksource_enqueue(c); if (!ret) diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 4c256fdb887..1ca99557e92 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -61,6 +61,7 @@ struct clocksource clocksource_jiffies = { .read = jiffies_read, .mask = 0xffffffff, /*32bits*/ .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ + .mult_orig = NSEC_PER_JIFFY << JIFFIES_SHIFT, .shift = JIFFIES_SHIFT, }; diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 1ad46f3df6e..1a20715bfd6 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -10,13 +10,13 @@ #include <linux/mm.h> #include <linux/time.h> -#include <linux/timer.h> #include <linux/timex.h> #include <linux/jiffies.h> #include <linux/hrtimer.h> #include <linux/capability.h> #include <linux/math64.h> #include <linux/clocksource.h> +#include <linux/workqueue.h> #include <asm/timex.h> /* @@ -218,11 +218,11 @@ void second_overflow(void) /* Disable the cmos update - used by virtualization and embedded */ int no_sync_cmos_clock __read_mostly; -static void sync_cmos_clock(unsigned long dummy); +static void sync_cmos_clock(struct work_struct *work); -static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); +static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); -static void sync_cmos_clock(unsigned long dummy) +static void sync_cmos_clock(struct work_struct *work) { struct timespec now, next; int fail = 1; @@ -258,13 +258,13 @@ static void sync_cmos_clock(unsigned long dummy) next.tv_sec++; next.tv_nsec -= NSEC_PER_SEC; } - mod_timer(&sync_cmos_timer, jiffies + timespec_to_jiffies(&next)); + schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next)); } static void notify_cmos_timer(void) { if (!no_sync_cmos_clock) - mod_timer(&sync_cmos_timer, jiffies + 1); + schedule_delayed_work(&sync_cmos_work, 0); } #else @@ -277,38 +277,50 @@ static inline void notify_cmos_timer(void) { } int do_adjtimex(struct timex *txc) { struct timespec ts; - long save_adjust, sec; int result; - /* In order to modify anything, you gotta be super-user! */ - if (txc->modes && !capable(CAP_SYS_TIME)) - return -EPERM; - - /* Now we validate the data before disabling interrupts */ - - if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) { + /* Validate the data before disabling interrupts */ + if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ - if (txc->modes & ~ADJ_OFFSET_SS_READ) + if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) return -EINVAL; + if (!(txc->modes & ADJ_OFFSET_READONLY) && + !capable(CAP_SYS_TIME)) + return -EPERM; + } else { + /* In order to modify anything, you gotta be super-user! */ + if (txc->modes && !capable(CAP_SYS_TIME)) + return -EPERM; + + /* if the quartz is off by more than 10% something is VERY wrong! */ + if (txc->modes & ADJ_TICK && + (txc->tick < 900000/USER_HZ || + txc->tick > 1100000/USER_HZ)) + return -EINVAL; + + if (txc->modes & ADJ_STATUS && time_state != TIME_OK) + hrtimer_cancel(&leap_timer); } - /* if the quartz is off by more than 10% something is VERY wrong ! */ - if (txc->modes & ADJ_TICK) - if (txc->tick < 900000/USER_HZ || - txc->tick > 1100000/USER_HZ) - return -EINVAL; - - if (time_state != TIME_OK && txc->modes & ADJ_STATUS) - hrtimer_cancel(&leap_timer); getnstimeofday(&ts); write_seqlock_irq(&xtime_lock); - /* Save for later - semantics of adjtime is to return old value */ - save_adjust = time_adjust; - /* If there are input parameters, then process them */ + if (txc->modes & ADJ_ADJTIME) { + long save_adjust = time_adjust; + + if (!(txc->modes & ADJ_OFFSET_READONLY)) { + /* adjtime() is independent from ntp_adjtime() */ + time_adjust = txc->offset; + ntp_update_frequency(); + } + txc->offset = save_adjust; + goto adj_done; + } if (txc->modes) { + long sec; + if (txc->modes & ADJ_STATUS) { if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { @@ -375,13 +387,8 @@ int do_adjtimex(struct timex *txc) if (txc->modes & ADJ_TAI && txc->constant > 0) time_tai = txc->constant; - if (txc->modes & ADJ_OFFSET) { - if (txc->modes == ADJ_OFFSET_SINGLESHOT) - /* adjtime() is independent from ntp_adjtime() */ - time_adjust = txc->offset; - else - ntp_update_offset(txc->offset); - } + if (txc->modes & ADJ_OFFSET) + ntp_update_offset(txc->offset); if (txc->modes & ADJ_TICK) tick_usec = txc->tick; @@ -389,22 +396,18 @@ int do_adjtimex(struct timex *txc) ntp_update_frequency(); } + txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, + NTP_SCALE_SHIFT); + if (!(time_status & STA_NANO)) + txc->offset /= NSEC_PER_USEC; + +adj_done: result = time_state; /* mostly `TIME_OK' */ if (time_status & (STA_UNSYNC|STA_CLOCKERR)) result = TIME_ERROR; - if ((txc->modes == ADJ_OFFSET_SINGLESHOT) || - (txc->modes == ADJ_OFFSET_SS_READ)) - txc->offset = save_adjust; - else { - txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, - NTP_SCALE_SHIFT); - if (!(time_status & STA_NANO)) - txc->offset /= NSEC_PER_USEC; - } - txc->freq = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) * - (s64)PPM_SCALE_INV, - NTP_SCALE_SHIFT); + txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * + (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT); txc->maxerror = time_maxerror; txc->esterror = time_esterror; txc->status = time_status; diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 2f5a38294bf..f98a1b7b16e 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -235,9 +235,9 @@ static void tick_do_broadcast_on_off(void *why) case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: if (!cpu_isset(cpu, tick_broadcast_mask)) { cpu_set(cpu, tick_broadcast_mask); - if (td->mode == TICKDEV_MODE_PERIODIC) - clockevents_set_mode(dev, - CLOCK_EVT_MODE_SHUTDOWN); + if (tick_broadcast_device.mode == + TICKDEV_MODE_PERIODIC) + clockevents_shutdown(dev); } if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) tick_broadcast_force = 1; @@ -246,7 +246,8 @@ static void tick_do_broadcast_on_off(void *why) if (!tick_broadcast_force && cpu_isset(cpu, tick_broadcast_mask)) { cpu_clear(cpu, tick_broadcast_mask); - if (td->mode == TICKDEV_MODE_PERIODIC) + if (tick_broadcast_device.mode == + TICKDEV_MODE_PERIODIC) tick_setup_periodic(dev, 0); } break; @@ -254,7 +255,7 @@ static void tick_do_broadcast_on_off(void *why) if (cpus_empty(tick_broadcast_mask)) { if (!bc_stopped) - clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(bc); } else if (bc_stopped) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) tick_broadcast_start_periodic(bc); @@ -306,7 +307,7 @@ void tick_shutdown_broadcast(unsigned int *cpup) if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { if (bc && cpus_empty(tick_broadcast_mask)) - clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(bc); } spin_unlock_irqrestore(&tick_broadcast_lock, flags); @@ -321,7 +322,7 @@ void tick_suspend_broadcast(void) bc = tick_broadcast_device.evtdev; if (bc) - clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(bc); spin_unlock_irqrestore(&tick_broadcast_lock, flags); } @@ -383,6 +384,19 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc) } /* + * Called from irq_enter() when idle was interrupted to reenable the + * per cpu device. + */ +void tick_check_oneshot_broadcast(int cpu) +{ + if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) { + struct tick_device *td = &per_cpu(tick_cpu_device, cpu); + + clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); + } +} + +/* * Handle oneshot mode broadcasting */ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) @@ -576,4 +590,12 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) spin_unlock_irqrestore(&tick_broadcast_lock, flags); } +/* + * Check, whether the broadcast device is in one shot mode + */ +int tick_broadcast_oneshot_active(void) +{ + return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT; +} + #endif diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index c4777193d56..df12434b43c 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -33,7 +33,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); */ ktime_t tick_next_period; ktime_t tick_period; -int tick_do_timer_cpu __read_mostly = -1; +int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; DEFINE_SPINLOCK(tick_device_lock); /* @@ -109,7 +109,8 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) if (!tick_device_is_functional(dev)) return; - if (dev->features & CLOCK_EVT_FEAT_PERIODIC) { + if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && + !tick_broadcast_oneshot_active()) { clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); } else { unsigned long seq; @@ -148,7 +149,7 @@ static void tick_setup_device(struct tick_device *td, * If no cpu took the do_timer update, assign it to * this cpu: */ - if (tick_do_timer_cpu == -1) { + if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { tick_do_timer_cpu = cpu; tick_next_period = ktime_get(); tick_period = ktime_set(0, NSEC_PER_SEC / HZ); @@ -249,7 +250,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) * not give it back to the clockevents layer ! */ if (tick_is_broadcast_device(curdev)) { - clockevents_set_mode(curdev, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(curdev); curdev = NULL; } clockevents_exchange_device(curdev, newdev); @@ -300,7 +301,8 @@ static void tick_shutdown(unsigned int *cpup) if (*cpup == tick_do_timer_cpu) { int cpu = first_cpu(cpu_online_map); - tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : -1; + tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : + TICK_DO_TIMER_NONE; } spin_unlock_irqrestore(&tick_device_lock, flags); } @@ -311,7 +313,7 @@ static void tick_suspend(void) unsigned long flags; spin_lock_irqsave(&tick_device_lock, flags); - clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(td->evtdev); spin_unlock_irqrestore(&tick_device_lock, flags); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 0ffc2918ea6..b1c05bf75ee 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -1,6 +1,10 @@ /* * tick internal variable and functions used by low/high res code */ + +#define TICK_DO_TIMER_NONE -1 +#define TICK_DO_TIMER_BOOT -2 + DECLARE_PER_CPU(struct tick_device, tick_cpu_device); extern spinlock_t tick_device_lock; extern ktime_t tick_next_period; @@ -10,6 +14,8 @@ extern int tick_do_timer_cpu __read_mostly; extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); +extern void clockevents_shutdown(struct clock_event_device *dev); + /* * NO_HZ / high resolution timer shared code */ @@ -29,6 +35,8 @@ extern void tick_broadcast_oneshot_control(unsigned long reason); extern void tick_broadcast_switch_to_oneshot(void); extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); +extern int tick_broadcast_oneshot_active(void); +extern void tick_check_oneshot_broadcast(int cpu); # else /* BROADCAST */ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { @@ -37,6 +45,8 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) static inline void tick_broadcast_oneshot_control(unsigned long reason) { } static inline void tick_broadcast_switch_to_oneshot(void) { } static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } +static inline int tick_broadcast_oneshot_active(void) { return 0; } +static inline void tick_check_oneshot_broadcast(int cpu) { } # endif /* !BROADCAST */ #else /* !ONESHOT */ @@ -66,6 +76,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) { return 0; } +static inline int tick_broadcast_oneshot_active(void) { return 0; } #endif /* !TICK_ONESHOT */ /* diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a87b0468568..0581c11fe6c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -20,6 +20,7 @@ #include <linux/profile.h> #include <linux/sched.h> #include <linux/tick.h> +#include <linux/module.h> #include <asm/irq_regs.h> @@ -75,6 +76,9 @@ static void tick_do_update_jiffies64(ktime_t now) incr * ticks); } do_timer(++ticks); + + /* Keep the tick_next_period variable up to date */ + tick_next_period = ktime_add(last_jiffies_update, tick_period); } write_sequnlock(&xtime_lock); } @@ -151,7 +155,7 @@ void tick_nohz_update_jiffies(void) touch_softlockup_watchdog(); } -void tick_nohz_stop_idle(int cpu) +static void tick_nohz_stop_idle(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); @@ -187,9 +191,17 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - *last_update_time = ktime_to_us(ts->idle_lastupdate); + if (!tick_nohz_enabled) + return -1; + + if (ts->idle_active) + *last_update_time = ktime_to_us(ts->idle_lastupdate); + else + *last_update_time = ktime_to_us(ktime_get()); + return ktime_to_us(ts->idle_sleeptime); } +EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); /** * tick_nohz_stop_sched_tick - stop the idle tick from the idle task @@ -221,7 +233,7 @@ void tick_nohz_stop_sched_tick(int inidle) */ if (unlikely(!cpu_online(cpu))) { if (cpu == tick_do_timer_cpu) - tick_do_timer_cpu = -1; + tick_do_timer_cpu = TICK_DO_TIMER_NONE; } if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) @@ -258,7 +270,7 @@ void tick_nohz_stop_sched_tick(int inidle) next_jiffies = get_next_timer_interrupt(last_jiffies); delta_jiffies = next_jiffies - last_jiffies; - if (rcu_needs_cpu(cpu)) + if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu)) delta_jiffies = 1; /* * Do not stop the tick, if we are only one off @@ -303,7 +315,7 @@ void tick_nohz_stop_sched_tick(int inidle) * invoked. */ if (cpu == tick_do_timer_cpu) - tick_do_timer_cpu = -1; + tick_do_timer_cpu = TICK_DO_TIMER_NONE; ts->idle_sleeps++; @@ -365,6 +377,32 @@ ktime_t tick_nohz_get_sleep_length(void) return ts->sleep_length; } +static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) +{ + hrtimer_cancel(&ts->sched_timer); + ts->sched_timer.expires = ts->idle_tick; + + while (1) { + /* Forward the time to expire in the future */ + hrtimer_forward(&ts->sched_timer, now, tick_period); + + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { + hrtimer_start(&ts->sched_timer, + ts->sched_timer.expires, + HRTIMER_MODE_ABS); + /* Check, if the timer was already in the past */ + if (hrtimer_active(&ts->sched_timer)) + break; + } else { + if (!tick_program_event(ts->sched_timer.expires, 0)) + break; + } + /* Update jiffies and reread time */ + tick_do_update_jiffies64(now); + now = ktime_get(); + } +} + /** * tick_nohz_restart_sched_tick - restart the idle tick from the idle task * @@ -418,28 +456,7 @@ void tick_nohz_restart_sched_tick(void) */ ts->tick_stopped = 0; ts->idle_exittime = now; - hrtimer_cancel(&ts->sched_timer); - ts->sched_timer.expires = ts->idle_tick; - - while (1) { - /* Forward the time to expire in the future */ - hrtimer_forward(&ts->sched_timer, now, tick_period); - - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { - hrtimer_start(&ts->sched_timer, - ts->sched_timer.expires, - HRTIMER_MODE_ABS); - /* Check, if the timer was already in the past */ - if (hrtimer_active(&ts->sched_timer)) - break; - } else { - if (!tick_program_event(ts->sched_timer.expires, 0)) - break; - } - /* Update jiffies and reread time */ - tick_do_update_jiffies64(now); - now = ktime_get(); - } + tick_nohz_restart(ts, now); local_irq_enable(); } @@ -468,7 +485,7 @@ static void tick_nohz_handler(struct clock_event_device *dev) * this duty, then the jiffies update is still serialized by * xtime_lock. */ - if (unlikely(tick_do_timer_cpu == -1)) + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) tick_do_timer_cpu = cpu; /* Check, if the jiffies need an update */ @@ -491,10 +508,6 @@ static void tick_nohz_handler(struct clock_event_device *dev) update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); - /* Do not restart, when we are in the idle loop */ - if (ts->tick_stopped) - return; - while (tick_nohz_reprogram(ts, now)) { now = ktime_get(); tick_do_update_jiffies64(now); @@ -540,6 +553,27 @@ static void tick_nohz_switch_to_nohz(void) smp_processor_id()); } +/* + * When NOHZ is enabled and the tick is stopped, we need to kick the + * tick timer from irq_enter() so that the jiffies update is kept + * alive during long running softirqs. That's ugly as hell, but + * correctness is key even if we need to fix the offending softirq in + * the first place. + * + * Note, this is different to tick_nohz_restart. We just kick the + * timer and do not touch the other magic bits which need to be done + * when idle is left. + */ +static void tick_nohz_kick_tick(int cpu) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + + if (!ts->tick_stopped) + return; + + tick_nohz_restart(ts, ktime_get()); +} + #else static inline void tick_nohz_switch_to_nohz(void) { } @@ -547,6 +581,19 @@ static inline void tick_nohz_switch_to_nohz(void) { } #endif /* NO_HZ */ /* + * Called from irq_enter to notify about the possible interruption of idle() + */ +void tick_check_idle(int cpu) +{ + tick_check_oneshot_broadcast(cpu); +#ifdef CONFIG_NO_HZ + tick_nohz_stop_idle(cpu); + tick_nohz_update_jiffies(); + tick_nohz_kick_tick(cpu); +#endif +} + +/* * High resolution timer specific code */ #ifdef CONFIG_HIGH_RES_TIMERS @@ -570,7 +617,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) * this duty, then the jiffies update is still serialized by * xtime_lock. */ - if (unlikely(tick_do_timer_cpu == -1)) + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) tick_do_timer_cpu = cpu; #endif @@ -599,10 +646,6 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) profile_tick(CPU_PROFILING); } - /* Do not restart, when we are in the idle loop */ - if (ts->tick_stopped) - return HRTIMER_NORESTART; - hrtimer_forward(timer, now, tick_period); return HRTIMER_RESTART; @@ -622,7 +665,7 @@ void tick_setup_sched_timer(void) */ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); ts->sched_timer.function = tick_sched_timer; - ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; /* Get the next period (per cpu) */ ts->sched_timer.expires = tick_init_jiffy_update(); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e91c29f961c..e7acfb482a6 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -58,27 +58,26 @@ struct clocksource *clock; #ifdef CONFIG_GENERIC_TIME /** - * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook + * clocksource_forward_now - update clock to the current time * - * private function, must hold xtime_lock lock when being - * called. Returns the number of nanoseconds since the - * last call to update_wall_time() (adjusted by NTP scaling) + * Forward the current clock to update its state since the last call to + * update_wall_time(). This is useful before significant clock changes, + * as it avoids having to deal with this time offset explicitly. */ -static inline s64 __get_nsec_offset(void) +static void clocksource_forward_now(void) { cycle_t cycle_now, cycle_delta; - s64 ns_offset; + s64 nsec; - /* read clocksource: */ cycle_now = clocksource_read(clock); - - /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + clock->cycle_last = cycle_now; - /* convert to nanoseconds: */ - ns_offset = cyc2ns(clock, cycle_delta); + nsec = cyc2ns(clock, cycle_delta); + timespec_add_ns(&xtime, nsec); - return ns_offset; + nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; + clock->raw_time.tv_nsec += nsec; } /** @@ -89,6 +88,7 @@ static inline s64 __get_nsec_offset(void) */ void getnstimeofday(struct timespec *ts) { + cycle_t cycle_now, cycle_delta; unsigned long seq; s64 nsecs; @@ -96,7 +96,15 @@ void getnstimeofday(struct timespec *ts) seq = read_seqbegin(&xtime_lock); *ts = xtime; - nsecs = __get_nsec_offset(); + + /* read clocksource: */ + cycle_now = clocksource_read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* convert to nanoseconds: */ + nsecs = cyc2ns(clock, cycle_delta); } while (read_seqretry(&xtime_lock, seq)); @@ -129,22 +137,22 @@ EXPORT_SYMBOL(do_gettimeofday); */ int do_settimeofday(struct timespec *tv) { + struct timespec ts_delta; unsigned long flags; - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) return -EINVAL; write_seqlock_irqsave(&xtime_lock, flags); - nsec -= __get_nsec_offset(); + clocksource_forward_now(); + + ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; + ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; + wall_to_monotonic = timespec_sub(wall_to_monotonic, ts_delta); - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); + xtime = *tv; - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); update_xtime_cache(0); clock->error = 0; @@ -170,22 +178,19 @@ EXPORT_SYMBOL(do_settimeofday); static void change_clocksource(void) { struct clocksource *new; - cycle_t now; - u64 nsec; new = clocksource_get_next(); if (clock == new) return; - new->cycle_last = 0; - now = clocksource_read(new); - nsec = __get_nsec_offset(); - timespec_add_ns(&xtime, nsec); + clocksource_forward_now(); - clock = new; - clock->cycle_last = now; + new->raw_time = clock->raw_time; + clock = new; + clock->cycle_last = 0; + clock->cycle_last = clocksource_read(new); clock->error = 0; clock->xtime_nsec = 0; clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); @@ -200,11 +205,44 @@ static void change_clocksource(void) */ } #else +static inline void clocksource_forward_now(void) { } static inline void change_clocksource(void) { } -static inline s64 __get_nsec_offset(void) { return 0; } #endif /** + * getrawmonotonic - Returns the raw monotonic time in a timespec + * @ts: pointer to the timespec to be set + * + * Returns the raw monotonic time (completely un-modified by ntp) + */ +void getrawmonotonic(struct timespec *ts) +{ + unsigned long seq; + s64 nsecs; + cycle_t cycle_now, cycle_delta; + + do { + seq = read_seqbegin(&xtime_lock); + + /* read clocksource: */ + cycle_now = clocksource_read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* convert to nanoseconds: */ + nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; + + *ts = clock->raw_time; + + } while (read_seqretry(&xtime_lock, seq)); + + timespec_add_ns(ts, nsecs); +} +EXPORT_SYMBOL(getrawmonotonic); + + +/** * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres */ int timekeeping_valid_for_hres(void) @@ -265,8 +303,6 @@ void __init timekeeping_init(void) static int timekeeping_suspended; /* time in seconds when suspend began */ static unsigned long timekeeping_suspend_time; -/* xtime offset when we went into suspend */ -static s64 timekeeping_suspend_nsecs; /** * timekeeping_resume - Resumes the generic timekeeping subsystem. @@ -292,8 +328,6 @@ static int timekeeping_resume(struct sys_device *dev) wall_to_monotonic.tv_sec -= sleep_length; total_sleep_time += sleep_length; } - /* Make sure that we have the correct xtime reference */ - timespec_add_ns(&xtime, timekeeping_suspend_nsecs); update_xtime_cache(0); /* re-base the last cycle value */ clock->cycle_last = 0; @@ -319,8 +353,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) timekeeping_suspend_time = read_persistent_clock(); write_seqlock_irqsave(&xtime_lock, flags); - /* Get the current xtime offset */ - timekeeping_suspend_nsecs = __get_nsec_offset(); + clocksource_forward_now(); timekeeping_suspended = 1; write_sequnlock_irqrestore(&xtime_lock, flags); @@ -454,23 +487,29 @@ void update_wall_time(void) #else offset = clock->cycle_interval; #endif - clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; + clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift; /* normally this loop will run just once, however in the * case of lost or late ticks, it will accumulate correctly. */ while (offset >= clock->cycle_interval) { /* accumulate one interval */ - clock->xtime_nsec += clock->xtime_interval; - clock->cycle_last += clock->cycle_interval; offset -= clock->cycle_interval; + clock->cycle_last += clock->cycle_interval; + clock->xtime_nsec += clock->xtime_interval; if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; xtime.tv_sec++; second_overflow(); } + clock->raw_time.tv_nsec += clock->raw_interval; + if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) { + clock->raw_time.tv_nsec -= NSEC_PER_SEC; + clock->raw_time.tv_sec++; + } + /* accumulate error between NTP and clock interval */ clock->error += tick_length; clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift); @@ -479,9 +518,12 @@ void update_wall_time(void) /* correct the clock when NTP error is too big */ clocksource_adjust(offset); - /* store full nanoseconds into xtime */ - xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; + /* store full nanoseconds into xtime after rounding it up and + * add the remainder to the error difference. + */ + xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1; clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; + clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift); update_xtime_cache(cyc2ns(clock, offset)); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index a40e20fd000..f6426911e35 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -47,13 +47,14 @@ static void print_name_offset(struct seq_file *m, void *sym) } static void -print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now) +print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, + int idx, u64 now) { #ifdef CONFIG_TIMER_STATS char tmp[TASK_COMM_LEN + 1]; #endif SEQ_printf(m, " #%d: ", idx); - print_name_offset(m, timer); + print_name_offset(m, taddr); SEQ_printf(m, ", "); print_name_offset(m, timer->function); SEQ_printf(m, ", S:%02lx", timer->state); @@ -99,7 +100,7 @@ next_one: tmp = *timer; spin_unlock_irqrestore(&base->cpu_base->lock, flags); - print_timer(m, &tmp, i, now); + print_timer(m, timer, &tmp, i, now); next++; goto next_one; } @@ -109,6 +110,7 @@ next_one: static void print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { + SEQ_printf(m, " .base: %p\n", base); SEQ_printf(m, " .index: %d\n", base->index); SEQ_printf(m, " .resolution: %Lu nsecs\n", @@ -183,12 +185,16 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) #ifdef CONFIG_GENERIC_CLOCKEVENTS static void -print_tickdevice(struct seq_file *m, struct tick_device *td) +print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) { struct clock_event_device *dev = td->evtdev; SEQ_printf(m, "\n"); SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); + if (cpu < 0) + SEQ_printf(m, "Broadcast device\n"); + else + SEQ_printf(m, "Per CPU device: %d\n", cpu); SEQ_printf(m, "Clock Event Device: "); if (!dev) { @@ -222,7 +228,7 @@ static void timer_list_show_tickdevices(struct seq_file *m) int cpu; #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST - print_tickdevice(m, tick_get_broadcast_device()); + print_tickdevice(m, tick_get_broadcast_device(), -1); SEQ_printf(m, "tick_broadcast_mask: %08lx\n", tick_get_broadcast_mask()->bits[0]); #ifdef CONFIG_TICK_ONESHOT @@ -232,7 +238,7 @@ static void timer_list_show_tickdevices(struct seq_file *m) SEQ_printf(m, "\n"); #endif for_each_online_cpu(cpu) - print_tickdevice(m, tick_get_device(cpu)); + print_tickdevice(m, tick_get_device(cpu), cpu); SEQ_printf(m, "\n"); } #else @@ -244,7 +250,7 @@ static int timer_list_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Timer List Version: v0.3\n"); + SEQ_printf(m, "Timer List Version: v0.4\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); diff --git a/kernel/timer.c b/kernel/timer.c index 03bc7f1f159..56becf373c5 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -978,6 +978,7 @@ void update_process_times(int user_tick) run_local_timers(); if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_tick); + printk_tick(); scheduler_tick(); run_posix_cpu_timers(p); } @@ -1435,9 +1436,11 @@ static void __cpuinit migrate_timers(int cpu) BUG_ON(cpu_online(cpu)); old_base = per_cpu(tvec_bases, cpu); new_base = get_cpu_var(tvec_bases); - - local_irq_disable(); - spin_lock(&new_base->lock); + /* + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. + */ + spin_lock_irq(&new_base->lock); spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); BUG_ON(old_base->running_timer); @@ -1452,8 +1455,7 @@ static void __cpuinit migrate_timers(int cpu) } spin_unlock(&old_base->lock); - spin_unlock(&new_base->lock); - local_irq_enable(); + spin_unlock_irq(&new_base->lock); put_cpu_var(tvec_bases); } #endif /* CONFIG_HOTPLUG_CPU */ diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index bb948e52ce2..db58fb66a13 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -202,7 +202,7 @@ static void start_stack_timer(int cpu) hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer->function = stack_trace_timer_fn; - hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL); } diff --git a/kernel/user.c b/kernel/user.c index 865ecf57a09..39d6159fae4 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -169,7 +169,7 @@ static ssize_t cpu_rt_runtime_show(struct kobject *kobj, { struct user_struct *up = container_of(kobj, struct user_struct, kobj); - return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg)); + return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg)); } static ssize_t cpu_rt_runtime_store(struct kobject *kobj, @@ -180,7 +180,7 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj, unsigned long rt_runtime; int rc; - sscanf(buf, "%lu", &rt_runtime); + sscanf(buf, "%ld", &rt_runtime); rc = sched_group_set_rt_runtime(up->tg, rt_runtime); diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 4ab9659d269..3b34b354593 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -60,7 +60,7 @@ static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, #ifdef CONFIG_SYSCTL_SYSCALL /* The generic string strategy routine: */ -static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, +static int sysctl_uts_string(ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -69,8 +69,7 @@ static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, write = newval && newlen; memcpy(&uts_table, table, sizeof(uts_table)); uts_table.data = get_uts(table, write); - r = sysctl_string(&uts_table, name, nlen, - oldval, oldlenp, newval, newlen); + r = sysctl_string(&uts_table, oldval, oldlenp, newval, newlen); put_uts(table, write, uts_table.data); return r; } diff --git a/kernel/wait.c b/kernel/wait.c index c275c56cf2d..cd87131f2fc 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -72,12 +72,7 @@ prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) spin_lock_irqsave(&q->lock, flags); if (list_empty(&wait->task_list)) __add_wait_queue(q, wait); - /* - * don't alter the task state if this is just going to - * queue an async wait queue callback - */ - if (is_sync_wait(wait)) - set_current_state(state); + set_current_state(state); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(prepare_to_wait); @@ -91,12 +86,7 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) spin_lock_irqsave(&q->lock, flags); if (list_empty(&wait->task_list)) __add_wait_queue_tail(q, wait); - /* - * don't alter the task state if this is just going to - * queue an async wait queue callback - */ - if (is_sync_wait(wait)) - set_current_state(state); + set_current_state(state); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(prepare_to_wait_exclusive); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4048e92aa04..714afad4653 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -9,7 +9,7 @@ * Derived from the taskqueue/keventd code by: * * David Woodhouse <dwmw2@infradead.org> - * Andrew Morton <andrewm@uow.edu.au> + * Andrew Morton * Kai Petzke <wpp@marie.physik.tu-berlin.de> * Theodore Ts'o <tytso@mit.edu> * |