aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2009-08-25 10:04:27 +0200
committerIngo Molnar <mingo@elte.hu>2009-08-25 10:04:32 +0200
commitdaedc71836e5a398fd0cc0e12c5cb43539478485 (patch)
treec56567a92017679e57195cef992d4a5561c20e0e /kernel
parentc36ba80ea01d0aecb652c26799a912e760ce8981 (diff)
parent422bef879e84104fee6dc68ded0e371dbeb5f88e (diff)
Merge commit 'v2.6.31-rc7' into irq/core
Merge reason: move from an -rc2 base to -rc7. Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cgroup.c151
-rw-r--r--kernel/exit.c1
-rw-r--r--kernel/fork.c30
-rw-r--r--kernel/freezer.c7
-rw-r--r--kernel/futex.c29
-rw-r--r--kernel/futex_compat.c6
-rw-r--r--kernel/hrtimer.c110
-rw-r--r--kernel/irq/internals.h3
-rw-r--r--kernel/irq/manage.c82
-rw-r--r--kernel/irq/migration.c2
-rw-r--r--kernel/irq/numa_migrate.c4
-rw-r--r--kernel/kexec.c2
-rw-r--r--kernel/kmod.c1
-rw-r--r--kernel/kprobes.c8
-rw-r--r--kernel/kthread.c10
-rw-r--r--kernel/lockdep_proc.c3
-rw-r--r--kernel/module.c9
-rw-r--r--kernel/panic.c1
-rw-r--r--kernel/perf_counter.c857
-rw-r--r--kernel/pid.c7
-rw-r--r--kernel/posix-cpu-timers.c7
-rw-r--r--kernel/posix-timers.c7
-rw-r--r--kernel/power/user.c1
-rw-r--r--kernel/profile.c5
-rw-r--r--kernel/ptrace.c4
-rw-r--r--kernel/rcutree.c3
-rw-r--r--kernel/rtmutex.c4
-rw-r--r--kernel/sched.c61
-rw-r--r--kernel/sched_cpupri.c15
-rw-r--r--kernel/sched_fair.c45
-rw-r--r--kernel/sched_rt.c18
-rw-r--r--kernel/signal.c25
-rw-r--r--kernel/smp.c2
-rw-r--r--kernel/softirq.c64
-rw-r--r--kernel/sysctl.c7
-rw-r--r--kernel/time/clockevents.c11
-rw-r--r--kernel/time/clocksource.c2
-rw-r--r--kernel/timer.c2
-rw-r--r--kernel/trace/Kconfig6
-rw-r--r--kernel/trace/blktrace.c13
-rw-r--r--kernel/trace/ftrace.c28
-rw-r--r--kernel/trace/ring_buffer.c15
-rw-r--r--kernel/trace/trace.c14
-rw-r--r--kernel/trace/trace.h4
-rw-r--r--kernel/trace/trace_event_profile.c2
-rw-r--r--kernel/trace/trace_event_types.h3
-rw-r--r--kernel/trace/trace_events.c4
-rw-r--r--kernel/trace/trace_events_filter.c20
-rw-r--r--kernel/trace/trace_functions.c2
-rw-r--r--kernel/trace/trace_functions_graph.c11
-rw-r--r--kernel/trace/trace_output.c3
-rw-r--r--kernel/trace/trace_printk.c2
-rw-r--r--kernel/trace/trace_stack.c11
-rw-r--r--kernel/trace/trace_stat.c34
-rw-r--r--kernel/wait.c5
55 files changed, 1169 insertions, 614 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3737a682cdf..b6eadfe30e7 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -47,6 +47,7 @@
#include <linux/hash.h>
#include <linux/namei.h>
#include <linux/smp_lock.h>
+#include <linux/pid_namespace.h>
#include <asm/atomic.h>
@@ -734,16 +735,28 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
* reference to css->refcnt. In general, this refcnt is expected to goes down
* to zero, soon.
*
- * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex;
+ * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
*/
DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
-static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp)
+static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
{
- if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
+ if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
wake_up_all(&cgroup_rmdir_waitq);
}
+void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
+{
+ css_get(css);
+}
+
+void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
+{
+ cgroup_wakeup_rmdir_waiter(css->cgroup);
+ css_put(css);
+}
+
+
static int rebind_subsystems(struct cgroupfs_root *root,
unsigned long final_bits)
{
@@ -960,6 +973,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->children);
INIT_LIST_HEAD(&cgrp->css_sets);
INIT_LIST_HEAD(&cgrp->release_list);
+ INIT_LIST_HEAD(&cgrp->pids_list);
init_rwsem(&cgrp->pids_mutex);
}
static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1357,7 +1371,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
* wake up rmdir() waiter. the rmdir should fail since the cgroup
* is no longer empty.
*/
- cgroup_wakeup_rmdir_waiters(cgrp);
+ cgroup_wakeup_rmdir_waiter(cgrp);
return 0;
}
@@ -2201,12 +2215,30 @@ err:
return ret;
}
+/*
+ * Cache pids for all threads in the same pid namespace that are
+ * opening the same "tasks" file.
+ */
+struct cgroup_pids {
+ /* The node in cgrp->pids_list */
+ struct list_head list;
+ /* The cgroup those pids belong to */
+ struct cgroup *cgrp;
+ /* The namespace those pids belong to */
+ struct pid_namespace *ns;
+ /* Array of process ids in the cgroup */
+ pid_t *tasks_pids;
+ /* How many files are using this tasks_pids array */
+ int use_count;
+ /* Length of the current tasks_pids array */
+ int length;
+};
+
static int cmppid(const void *a, const void *b)
{
return *(pid_t *)a - *(pid_t *)b;
}
-
/*
* seq_file methods for the "tasks" file. The seq_file position is the
* next pid to display; the seq_file iterator is a pointer to the pid
@@ -2221,45 +2253,47 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
* after a seek to the start). Use a binary-search to find the
* next pid to display, if any
*/
- struct cgroup *cgrp = s->private;
+ struct cgroup_pids *cp = s->private;
+ struct cgroup *cgrp = cp->cgrp;
int index = 0, pid = *pos;
int *iter;
down_read(&cgrp->pids_mutex);
if (pid) {
- int end = cgrp->pids_length;
+ int end = cp->length;
while (index < end) {
int mid = (index + end) / 2;
- if (cgrp->tasks_pids[mid] == pid) {
+ if (cp->tasks_pids[mid] == pid) {
index = mid;
break;
- } else if (cgrp->tasks_pids[mid] <= pid)
+ } else if (cp->tasks_pids[mid] <= pid)
index = mid + 1;
else
end = mid;
}
}
/* If we're off the end of the array, we're done */
- if (index >= cgrp->pids_length)
+ if (index >= cp->length)
return NULL;
/* Update the abstract position to be the actual pid that we found */
- iter = cgrp->tasks_pids + index;
+ iter = cp->tasks_pids + index;
*pos = *iter;
return iter;
}
static void cgroup_tasks_stop(struct seq_file *s, void *v)
{
- struct cgroup *cgrp = s->private;
+ struct cgroup_pids *cp = s->private;
+ struct cgroup *cgrp = cp->cgrp;
up_read(&cgrp->pids_mutex);
}
static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
{
- struct cgroup *cgrp = s->private;
+ struct cgroup_pids *cp = s->private;
int *p = v;
- int *end = cgrp->tasks_pids + cgrp->pids_length;
+ int *end = cp->tasks_pids + cp->length;
/*
* Advance to the next pid in the array. If this goes off the
@@ -2286,26 +2320,33 @@ static struct seq_operations cgroup_tasks_seq_operations = {
.show = cgroup_tasks_show,
};
-static void release_cgroup_pid_array(struct cgroup *cgrp)
+static void release_cgroup_pid_array(struct cgroup_pids *cp)
{
+ struct cgroup *cgrp = cp->cgrp;
+
down_write(&cgrp->pids_mutex);
- BUG_ON(!cgrp->pids_use_count);
- if (!--cgrp->pids_use_count) {
- kfree(cgrp->tasks_pids);
- cgrp->tasks_pids = NULL;
- cgrp->pids_length = 0;
+ BUG_ON(!cp->use_count);
+ if (!--cp->use_count) {
+ list_del(&cp->list);
+ put_pid_ns(cp->ns);
+ kfree(cp->tasks_pids);
+ kfree(cp);
}
up_write(&cgrp->pids_mutex);
}
static int cgroup_tasks_release(struct inode *inode, struct file *file)
{
- struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+ struct seq_file *seq;
+ struct cgroup_pids *cp;
if (!(file->f_mode & FMODE_READ))
return 0;
- release_cgroup_pid_array(cgrp);
+ seq = file->private_data;
+ cp = seq->private;
+
+ release_cgroup_pid_array(cp);
return seq_release(inode, file);
}
@@ -2324,6 +2365,8 @@ static struct file_operations cgroup_tasks_operations = {
static int cgroup_tasks_open(struct inode *unused, struct file *file)
{
struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+ struct pid_namespace *ns = current->nsproxy->pid_ns;
+ struct cgroup_pids *cp;
pid_t *pidarray;
int npids;
int retval;
@@ -2350,20 +2393,37 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
* array if necessary
*/
down_write(&cgrp->pids_mutex);
- kfree(cgrp->tasks_pids);
- cgrp->tasks_pids = pidarray;
- cgrp->pids_length = npids;
- cgrp->pids_use_count++;
+
+ list_for_each_entry(cp, &cgrp->pids_list, list) {
+ if (ns == cp->ns)
+ goto found;
+ }
+
+ cp = kzalloc(sizeof(*cp), GFP_KERNEL);
+ if (!cp) {
+ up_write(&cgrp->pids_mutex);
+ kfree(pidarray);
+ return -ENOMEM;
+ }
+ cp->cgrp = cgrp;
+ cp->ns = ns;
+ get_pid_ns(ns);
+ list_add(&cp->list, &cgrp->pids_list);
+found:
+ kfree(cp->tasks_pids);
+ cp->tasks_pids = pidarray;
+ cp->length = npids;
+ cp->use_count++;
up_write(&cgrp->pids_mutex);
file->f_op = &cgroup_tasks_operations;
retval = seq_open(file, &cgroup_tasks_seq_operations);
if (retval) {
- release_cgroup_pid_array(cgrp);
+ release_cgroup_pid_array(cp);
return retval;
}
- ((struct seq_file *)file->private_data)->private = cgrp;
+ ((struct seq_file *)file->private_data)->private = cp;
return 0;
}
@@ -2696,33 +2756,42 @@ again:
mutex_unlock(&cgroup_mutex);
/*
+ * In general, subsystem has no css->refcnt after pre_destroy(). But
+ * in racy cases, subsystem may have to get css->refcnt after
+ * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
+ * makes rmdir return -EBUSY too often. To avoid that, we use a waitqueue
+ * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
+ * and subsystem's reference count handling. Please see css_get/put
+ * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
+ */
+ set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+
+ /*
* Call pre_destroy handlers of subsys. Notify subsystems
* that rmdir() request comes.
*/
ret = cgroup_call_pre_destroy(cgrp);
- if (ret)
+ if (ret) {
+ clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
return ret;
+ }
mutex_lock(&cgroup_mutex);
parent = cgrp->parent;
if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
+ clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
mutex_unlock(&cgroup_mutex);
return -EBUSY;
}
- /*
- * css_put/get is provided for subsys to grab refcnt to css. In typical
- * case, subsystem has no reference after pre_destroy(). But, under
- * hierarchy management, some *temporal* refcnt can be hold.
- * To avoid returning -EBUSY to a user, waitqueue is used. If subsys
- * is really busy, it should return -EBUSY at pre_destroy(). wake_up
- * is called when css_put() is called and refcnt goes down to 0.
- */
- set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
-
if (!cgroup_clear_css_refs(cgrp)) {
mutex_unlock(&cgroup_mutex);
- schedule();
+ /*
+ * Because someone may call cgroup_wakeup_rmdir_waiter() before
+ * prepare_to_wait(), we need to check this flag.
+ */
+ if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
+ schedule();
finish_wait(&cgroup_rmdir_waitq, &wait);
clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
if (signal_pending(current))
@@ -3294,7 +3363,7 @@ void __css_put(struct cgroup_subsys_state *css)
set_bit(CGRP_RELEASABLE, &cgrp->flags);
check_for_release(cgrp);
}
- cgroup_wakeup_rmdir_waiters(cgrp);
+ cgroup_wakeup_rmdir_waiter(cgrp);
}
rcu_read_unlock();
}
diff --git a/kernel/exit.c b/kernel/exit.c
index 628d41f0dd5..869dc221733 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -12,7 +12,6 @@
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/tty.h>
-#include <linux/mnt_namespace.h>
#include <linux/iocontext.h>
#include <linux/key.h>
#include <linux/security.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index 467746b3f0a..144326b7af5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -17,7 +17,6 @@
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/completion.h>
-#include <linux/mnt_namespace.h>
#include <linux/personality.h>
#include <linux/mempolicy.h>
#include <linux/sem.h>
@@ -568,18 +567,18 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
* the value intact in a core dump, and to save the unnecessary
* trouble otherwise. Userland only wants this done for a sys_exit.
*/
- if (tsk->clear_child_tid
- && !(tsk->flags & PF_SIGNALED)
- && atomic_read(&mm->mm_users) > 1) {
- u32 __user * tidptr = tsk->clear_child_tid;
+ if (tsk->clear_child_tid) {
+ if (!(tsk->flags & PF_SIGNALED) &&
+ atomic_read(&mm->mm_users) > 1) {
+ /*
+ * We don't check the error code - if userspace has
+ * not set up a proper pointer then tough luck.
+ */
+ put_user(0, tsk->clear_child_tid);
+ sys_futex(tsk->clear_child_tid, FUTEX_WAKE,
+ 1, NULL, NULL, 0);
+ }
tsk->clear_child_tid = NULL;
-
- /*
- * We don't check the error code - if userspace has
- * not set up a proper pointer then tough luck.
- */
- put_user(0, tidptr);
- sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
}
}
@@ -1269,6 +1268,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
cgroup_post_fork(p);
+ perf_counter_fork(p);
return p;
bad_fork_free_pid:
@@ -1408,12 +1408,6 @@ long do_fork(unsigned long clone_flags,
if (clone_flags & CLONE_VFORK) {
p->vfork_done = &vfork;
init_completion(&vfork);
- } else if (!(clone_flags & CLONE_VM)) {
- /*
- * vfork will do an exec which will call
- * set_task_comm()
- */
- perf_counter_fork(p);
}
audit_finish_fork(p);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 2f4936cf708..bd1d42b17cb 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -44,12 +44,19 @@ void refrigerator(void)
recalc_sigpending(); /* We sent fake signal, clean it up */
spin_unlock_irq(&current->sighand->siglock);
+ /* prevent accounting of that task to load */
+ current->flags |= PF_FREEZING;
+
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (!frozen(current))
break;
schedule();
}
+
+ /* Remove the accounting blocker */
+ current->flags &= ~PF_FREEZING;
+
pr_debug("%s left refrigerator\n", current->comm);
__set_current_state(save);
}
diff --git a/kernel/futex.c b/kernel/futex.c
index 794c862125f..e18cfbdc719 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -247,6 +247,7 @@ again:
if (err < 0)
return err;
+ page = compound_head(page);
lock_page(page);
if (!page->mapping) {
unlock_page(page);
@@ -1009,15 +1010,19 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
* requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
* q: the futex_q
* key: the key of the requeue target futex
+ * hb: the hash_bucket of the requeue target futex
*
* During futex_requeue, with requeue_pi=1, it is possible to acquire the
* target futex if it is uncontended or via a lock steal. Set the futex_q key
* to the requeue target futex so the waiter can detect the wakeup on the right
* futex, but remove it from the hb and NULL the rt_waiter so it can detect
- * atomic lock acquisition. Must be called with the q->lock_ptr held.
+ * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock
+ * to protect access to the pi_state to fixup the owner later. Must be called
+ * with both q->lock_ptr and hb->lock held.
*/
static inline
-void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key)
+void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
+ struct futex_hash_bucket *hb)
{
drop_futex_key_refs(&q->key);
get_futex_key_refs(key);
@@ -1029,6 +1034,11 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key)
WARN_ON(!q->rt_waiter);
q->rt_waiter = NULL;
+ q->lock_ptr = &hb->lock;
+#ifdef CONFIG_DEBUG_PI_LIST
+ q->list.plist.lock = &hb->lock;
+#endif
+
wake_up_state(q->task, TASK_NORMAL);
}
@@ -1087,7 +1097,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
set_waiters);
if (ret == 1)
- requeue_pi_wake_futex(top_waiter, key2);
+ requeue_pi_wake_futex(top_waiter, key2, hb2);
return ret;
}
@@ -1246,8 +1256,15 @@ retry_private:
if (!match_futex(&this->key, &key1))
continue;
- WARN_ON(!requeue_pi && this->rt_waiter);
- WARN_ON(requeue_pi && !this->rt_waiter);
+ /*
+ * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
+ * be paired with each other and no other futex ops.
+ */
+ if ((requeue_pi && !this->rt_waiter) ||
+ (!requeue_pi && this->rt_waiter)) {
+ ret = -EINVAL;
+ break;
+ }
/*
* Wake nr_wake waiters. For requeue_pi, if we acquired the
@@ -1272,7 +1289,7 @@ retry_private:
this->task, 1);
if (ret == 1) {
/* We got the lock. */
- requeue_pi_wake_futex(this, &key2);
+ requeue_pi_wake_futex(this, &key2, hb2);
continue;
} else if (ret) {
/* -EDEADLK */
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index d607a5b9ee2..235716556bf 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -180,7 +180,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
int cmd = op & FUTEX_CMD_MASK;
if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
- cmd == FUTEX_WAIT_BITSET)) {
+ cmd == FUTEX_WAIT_BITSET ||
+ cmd == FUTEX_WAIT_REQUEUE_PI)) {
if (get_compat_timespec(&ts, utime))
return -EFAULT;
if (!timespec_valid(&ts))
@@ -191,7 +192,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
t = ktime_add_safe(ktime_get(), t);
tp = &t;
}
- if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE)
+ if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
+ cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
val2 = (int) (unsigned long) utime;
return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 9002958a96e..49da79ab848 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -191,6 +191,46 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
}
}
+
+/*
+ * Get the preferred target CPU for NOHZ
+ */
+static int hrtimer_get_target(int this_cpu, int pinned)
+{
+#ifdef CONFIG_NO_HZ
+ if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) {
+ int preferred_cpu = get_nohz_load_balancer();
+
+ if (preferred_cpu >= 0)
+ return preferred_cpu;
+ }
+#endif
+ return this_cpu;
+}
+
+/*
+ * With HIGHRES=y we do not migrate the timer when it is expiring
+ * before the next event on the target cpu because we cannot reprogram
+ * the target cpu hardware and we would cause it to fire late.
+ *
+ * Called with cpu_base->lock of target cpu held.
+ */
+static int
+hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
+{
+#ifdef CONFIG_HIGH_RES_TIMERS
+ ktime_t expires;
+
+ if (!new_base->cpu_base->hres_active)
+ return 0;
+
+ expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
+ return expires.tv64 <= new_base->cpu_base->expires_next.tv64;
+#else
+ return 0;
+#endif
+}
+
/*
* Switch the timer base to the current CPU when possible.
*/
@@ -200,16 +240,8 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
{
struct hrtimer_clock_base *new_base;
struct hrtimer_cpu_base *new_cpu_base;
- int cpu, preferred_cpu = -1;
-
- cpu = smp_processor_id();
-#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
- if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
- preferred_cpu = get_nohz_load_balancer();
- if (preferred_cpu >= 0)
- cpu = preferred_cpu;
- }
-#endif
+ int this_cpu = smp_processor_id();
+ int cpu = hrtimer_get_target(this_cpu, pinned);
again:
new_cpu_base = &per_cpu(hrtimer_bases, cpu);
@@ -217,7 +249,7 @@ again:
if (base != new_base) {
/*
- * We are trying to schedule the timer on the local CPU.
+ * We are trying to move timer to new_base.
* However we can't change timer's base while it is running,
* so we keep it on the same CPU. No hassle vs. reprogramming
* the event source in the high resolution case. The softirq
@@ -233,38 +265,12 @@ again:
spin_unlock(&base->cpu_base->lock);
spin_lock(&new_base->cpu_base->lock);
- /* Optimized away for NOHZ=n SMP=n */
- if (cpu == preferred_cpu) {
- /* Calculate clock monotonic expiry time */
-#ifdef CONFIG_HIGH_RES_TIMERS
- ktime_t expires = ktime_sub(hrtimer_get_expires(timer),
- new_base->offset);
-#else
- ktime_t expires = hrtimer_get_expires(timer);
-#endif
-
- /*
- * Get the next event on target cpu from the
- * clock events layer.
- * This covers the highres=off nohz=on case as well.
- */
- ktime_t next = clockevents_get_next_event(cpu);
-
- ktime_t delta = ktime_sub(expires, next);
-
- /*
- * We do not migrate the timer when it is expiring
- * before the next event on the target cpu because
- * we cannot reprogram the target cpu hardware and
- * we would cause it to fire late.
- */
- if (delta.tv64 < 0) {
- cpu = smp_processor_id();
- spin_unlock(&new_base->cpu_base->lock);
- spin_lock(&base->cpu_base->lock);
- timer->base = base;
- goto again;
- }
+ if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
+ cpu = this_cpu;
+ spin_unlock(&new_base->cpu_base->lock);
+ spin_lock(&base->cpu_base->lock);
+ timer->base = base;
+ goto again;
}
timer->base = new_base;
}
@@ -1276,14 +1282,22 @@ void hrtimer_interrupt(struct clock_event_device *dev)
expires_next.tv64 = KTIME_MAX;
+ spin_lock(&cpu_base->lock);
+ /*
+ * We set expires_next to KTIME_MAX here with cpu_base->lock
+ * held to prevent that a timer is enqueued in our queue via
+ * the migration code. This does not affect enqueueing of
+ * timers which run their callback and need to be requeued on
+ * this CPU.
+ */
+ cpu_base->expires_next.tv64 = KTIME_MAX;
+
base = cpu_base->clock_base;
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
ktime_t basenow;
struct rb_node *node;
- spin_lock(&cpu_base->lock);
-
basenow = ktime_add(now, base->offset);
while ((node = base->first)) {
@@ -1316,11 +1330,15 @@ void hrtimer_interrupt(struct clock_event_device *dev)
<