diff options
Diffstat (limited to 'kernel/sched.c')
-rw-r--r-- | kernel/sched.c | 1082 |
1 files changed, 663 insertions, 419 deletions
diff --git a/kernel/sched.c b/kernel/sched.c index deca041fc36..a646e4f36c4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -166,7 +166,7 @@ #define SCALE_PRIO(x, prio) \ max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) -static inline unsigned int task_timeslice(task_t *p) +static unsigned int task_timeslice(task_t *p) { if (p->static_prio < NICE_TO_PRIO(0)) return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); @@ -206,7 +206,7 @@ struct runqueue { */ unsigned long nr_running; #ifdef CONFIG_SMP - unsigned long cpu_load; + unsigned long cpu_load[3]; #endif unsigned long long nr_switches; @@ -260,22 +260,86 @@ struct runqueue { static DEFINE_PER_CPU(struct runqueue, runqueues); +/* + * The domain tree (rq->sd) is protected by RCU's quiescent state transition. + * See detach_destroy_domains: synchronize_sched for details. + * + * The domain tree of any CPU may only be accessed from within + * preempt-disabled sections. + */ #define for_each_domain(cpu, domain) \ - for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent) +for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent) #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) #define this_rq() (&__get_cpu_var(runqueues)) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -/* - * Default context-switch locking: - */ #ifndef prepare_arch_switch -# define prepare_arch_switch(rq, next) do { } while (0) -# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) -# define task_running(rq, p) ((rq)->curr == (p)) +# define prepare_arch_switch(next) do { } while (0) +#endif +#ifndef finish_arch_switch +# define finish_arch_switch(prev) do { } while (0) +#endif + +#ifndef __ARCH_WANT_UNLOCKED_CTXSW +static inline int task_running(runqueue_t *rq, task_t *p) +{ + return rq->curr == p; +} + +static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) +{ +} + +static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) +{ + spin_unlock_irq(&rq->lock); +} + +#else /* __ARCH_WANT_UNLOCKED_CTXSW */ +static inline int task_running(runqueue_t *rq, task_t *p) +{ +#ifdef CONFIG_SMP + return p->oncpu; +#else + return rq->curr == p; +#endif +} + +static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) +{ +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. + */ + next->oncpu = 1; #endif +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + spin_unlock_irq(&rq->lock); +#else + spin_unlock(&rq->lock); +#endif +} + +static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) +{ +#ifdef CONFIG_SMP + /* + * After ->oncpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + */ + smp_wmb(); + prev->oncpu = 0; +#endif +#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW + local_irq_enable(); +#endif +} +#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ /* * task_rq_lock - lock the runqueue a given task resides on and disable @@ -309,7 +373,7 @@ static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) * bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ -#define SCHEDSTAT_VERSION 11 +#define SCHEDSTAT_VERSION 12 static int show_schedstat(struct seq_file *seq, void *v) { @@ -338,6 +402,7 @@ static int show_schedstat(struct seq_file *seq, void *v) #ifdef CONFIG_SMP /* domain-specific stats */ + preempt_disable(); for_each_domain(cpu, sd) { enum idle_type itype; char mask_str[NR_CPUS]; @@ -356,11 +421,13 @@ static int show_schedstat(struct seq_file *seq, void *v) sd->lb_nobusyq[itype], sd->lb_nobusyg[itype]); } - seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n", + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", sd->alb_cnt, sd->alb_failed, sd->alb_pushed, - sd->sbe_pushed, sd->sbe_attempts, + sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, + sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); } + preempt_enable(); #endif } return 0; @@ -414,22 +481,6 @@ static inline runqueue_t *this_rq_lock(void) return rq; } -#ifdef CONFIG_SCHED_SMT -static int cpu_and_siblings_are_idle(int cpu) -{ - int sib; - for_each_cpu_mask(sib, cpu_sibling_map[cpu]) { - if (idle_cpu(sib)) - continue; - return 0; - } - - return 1; -} -#else -#define cpu_and_siblings_are_idle(A) idle_cpu(A) -#endif - #ifdef CONFIG_SCHEDSTATS /* * Called when a process is dequeued from the active array and given @@ -622,7 +673,7 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq) rq->nr_running++; } -static void recalc_task_prio(task_t *p, unsigned long long now) +static int recalc_task_prio(task_t *p, unsigned long long now) { /* Caller must always ensure 'now >= p->timestamp' */ unsigned long long __sleep_time = now - p->timestamp; @@ -681,7 +732,7 @@ static void recalc_task_prio(task_t *p, unsigned long long now) } } - p->prio = effective_prio(p); + return effective_prio(p); } /* @@ -704,7 +755,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) } #endif - recalc_task_prio(p, now); + p->prio = recalc_task_prio(p, now); /* * This checks to make sure it's not an uninterruptible task @@ -782,22 +833,12 @@ inline int task_curr(const task_t *p) } #ifdef CONFIG_SMP -enum request_type { - REQ_MOVE_TASK, - REQ_SET_DOMAIN, -}; - typedef struct { struct list_head list; - enum request_type type; - /* For REQ_MOVE_TASK */ task_t *task; int dest_cpu; - /* For REQ_SET_DOMAIN */ - struct sched_domain *sd; - struct completion done; } migration_req_t; @@ -819,7 +860,6 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) } init_completion(&req->done); - req->type = REQ_MOVE_TASK; req->task = p; req->dest_cpu = dest_cpu; list_add(&req->list, &rq->migration_queue); @@ -886,26 +926,154 @@ void kick_process(task_t *p) * We want to under-estimate the load of migration sources, to * balance conservatively. */ -static inline unsigned long source_load(int cpu) +static inline unsigned long source_load(int cpu, int type) { runqueue_t *rq = cpu_rq(cpu); unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + if (type == 0) + return load_now; - return min(rq->cpu_load, load_now); + return min(rq->cpu_load[type-1], load_now); } /* * Return a high guess at the load of a migration-target cpu */ -static inline unsigned long target_load(int cpu) +static inline unsigned long target_load(int cpu, int type) { runqueue_t *rq = cpu_rq(cpu); unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + if (type == 0) + return load_now; - return max(rq->cpu_load, load_now); + return max(rq->cpu_load[type-1], load_now); } -#endif +/* + * find_idlest_group finds and returns the least busy CPU group within the + * domain. + */ +static struct sched_group * +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) +{ + struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; + unsigned long min_load = ULONG_MAX, this_load = 0; + int load_idx = sd->forkexec_idx; + int imbalance = 100 + (sd->imbalance_pct-100)/2; + + do { + unsigned long load, avg_load; + int local_group; + int i; + + local_group = cpu_isset(this_cpu, group->cpumask); + /* XXX: put a cpus allowed check */ + + /* Tally up the load of all CPUs in the group */ + avg_load = 0; + + for_each_cpu_mask(i, group->cpumask) { + /* Bias balancing toward cpus of our domain */ + if (local_group) + load = source_load(i, load_idx); + else + load = target_load(i, load_idx); + + avg_load += load; + } + + /* Adjust by relative CPU power of the group */ + avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + + if (local_group) { + this_load = avg_load; + this = group; + } else if (avg_load < min_load) { + min_load = avg_load; + idlest = group; + } + group = group->next; + } while (group != sd->groups); + + if (!idlest || 100*this_load < imbalance*min_load) + return NULL; + return idlest; +} + +/* + * find_idlest_queue - find the idlest runqueue among the cpus in group. + */ +static int find_idlest_cpu(struct sched_group *group, int this_cpu) +{ + unsigned long load, min_load = ULONG_MAX; + int idlest = -1; + int i; + + for_each_cpu_mask(i, group->cpumask) { + load = source_load(i, 0); + + if (load < min_load || (load == min_load && i == this_cpu)) { + min_load = load; + idlest = i; + } + } + + return idlest; +} + +/* + * sched_balance_self: balance the current task (running on cpu) in domains + * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and + * SD_BALANCE_EXEC. + * + * Balance, ie. select the least loaded group. + * + * Returns the target CPU number, or the same CPU if no balancing is needed. + * + * preempt must be disabled. + */ +static int sched_balance_self(int cpu, int flag) +{ + struct task_struct *t = current; + struct sched_domain *tmp, *sd = NULL; + + for_each_domain(cpu, tmp) + if (tmp->flags & flag) + sd = tmp; + + while (sd) { + cpumask_t span; + struct sched_group *group; + int new_cpu; + int weight; + + span = sd->span; + group = find_idlest_group(sd, t, cpu); + if (!group) + goto nextlevel; + + new_cpu = find_idlest_cpu(group, cpu); + if (new_cpu == -1 || new_cpu == cpu) + goto nextlevel; + + /* Now try balancing at a lower domain level */ + cpu = new_cpu; +nextlevel: + sd = NULL; + weight = cpus_weight(span); + for_each_domain(cpu, tmp) { + if (weight <= cpus_weight(tmp->span)) + break; + if (tmp->flags & flag) + sd = tmp; + } + /* while loop will break here if sd == NULL */ + } + + return cpu; +} + +#endif /* CONFIG_SMP */ /* * wake_idle() will wake a task on an idle cpu if task->cpu is @@ -927,14 +1095,14 @@ static int wake_idle(int cpu, task_t *p) for_each_domain(cpu, sd) { if (sd->flags & SD_WAKE_IDLE) { - cpus_and(tmp, sd->span, cpu_online_map); - cpus_and(tmp, tmp, p->cpus_allowed); + cpus_and(tmp, sd->span, p->cpus_allowed); for_each_cpu_mask(i, tmp) { if (idle_cpu(i)) return i; } } - else break; + else + break; } return cpu; } @@ -967,7 +1135,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync) runqueue_t *rq; #ifdef CONFIG_SMP unsigned long load, this_load; - struct sched_domain *sd; + struct sched_domain *sd, *this_sd = NULL; int new_cpu; #endif @@ -986,70 +1154,69 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync) if (unlikely(task_running(rq, p))) goto out_activate; -#ifdef CONFIG_SCHEDSTATS + new_cpu = cpu; + schedstat_inc(rq, ttwu_cnt); if (cpu == this_cpu) { schedstat_inc(rq, ttwu_local); - } else { - for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { - schedstat_inc(sd, ttwu_wake_remote); - break; - } + goto out_set_cpu; + } + + for_each_domain(this_cpu, sd) { + if (cpu_isset(cpu, sd->span)) { + schedstat_inc(sd, ttwu_wake_remote); + this_sd = sd; + break; } } -#endif - new_cpu = cpu; - if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) + if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) goto out_set_cpu; - load = source_load(cpu); - this_load = target_load(this_cpu); - /* - * If sync wakeup then subtract the (maximum possible) effect of - * the currently running task from the load of the current CPU: + * Check for affine wakeup and passive balancing possibilities. */ - if (sync) - this_load -= SCHED_LOAD_SCALE; + if (this_sd) { + int idx = this_sd->wake_idx; + unsigned int imbalance; - /* Don't pull the task off an idle CPU to a busy one */ - if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2) - goto out_set_cpu; + imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; - new_cpu = this_cpu; /* Wake to this CPU if we can */ + load = source_load(cpu, idx); + this_load = target_load(this_cpu, idx); - /* - * Scan domains for affine wakeup and passive balancing - * possibilities. - */ - for_each_domain(this_cpu, sd) { - unsigned int imbalance; - /* - * Start passive balancing when half the imbalance_pct - * limit is reached. - */ - imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2; + new_cpu = this_cpu; /* Wake to this CPU if we can */ - if ((sd->flags & SD_WAKE_AFFINE) && - !task_hot(p, rq->timestamp_last_tick, sd)) { + if (this_sd->flags & SD_WAKE_AFFINE) { + unsigned long tl = this_load; /* - * This domain has SD_WAKE_AFFINE and p is cache cold - * in this domain. + * If sync wakeup then subtract the (maximum possible) + * effect of the currently running task from the load + * of the current CPU: */ - if (cpu_isset(cpu, sd->span)) { - schedstat_inc(sd, ttwu_move_affine); + if (sync) + tl -= SCHED_LOAD_SCALE; + + if ((tl <= load && + tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) || + 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) { + /* + * This domain has SD_WAKE_AFFINE and + * p is cache cold in this domain, and + * there is no bad imbalance. + */ + schedstat_inc(this_sd, ttwu_move_affine); goto out_set_cpu; } - } else if ((sd->flags & SD_WAKE_BALANCE) && - imbalance*this_load <= 100*load) { - /* - * This domain has SD_WAKE_BALANCE and there is - * an imbalance. - */ - if (cpu_isset(cpu, sd->span)) { - schedstat_inc(sd, ttwu_move_balance); + } + + /* + * Start passive balancing when half the imbalance_pct + * limit is reached. + */ + if (this_sd->flags & SD_WAKE_BALANCE) { + if (imbalance*this_load <= 100*load) { + schedstat_inc(this_sd, ttwu_move_balance); goto out_set_cpu; } } @@ -1120,17 +1287,19 @@ int fastcall wake_up_state(task_t *p, unsigned int state) return try_to_wake_up(p, state, 0); } -#ifdef CONFIG_SMP -static int find_idlest_cpu(struct task_struct *p, int this_cpu, - struct sched_domain *sd); -#endif - /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. */ -void fastcall sched_fork(task_t *p) +void fastcall sched_fork(task_t *p, int clone_flags) { + int cpu = get_cpu(); + +#ifdef CONFIG_SMP + cpu = sched_balance_self(cpu, SD_BALANCE_FORK); +#endif + set_task_cpu(p, cpu); + /* * We mark the process as running here, but have not actually * inserted it onto the runqueue yet. This guarantees that @@ -1140,17 +1309,14 @@ void fastcall sched_fork(task_t *p) p->state = TASK_RUNNING; INIT_LIST_HEAD(&p->run_list); p->array = NULL; - spin_lock_init(&p->switch_lock); #ifdef CONFIG_SCHEDSTATS memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) + p->oncpu = 0; +#endif #ifdef CONFIG_PREEMPT - /* - * During context-switch we hold precisely one spinlock, which - * schedule_tail drops. (in the common case it's this_rq()->lock, - * but it also can be p->switch_lock.) So we compensate with a count - * of 1. Also, we want to start with kernel preemption disabled. - */ + /* Want to start with kernel preemption disabled. */ p->thread_info->preempt_count = 1; #endif /* @@ -1174,12 +1340,10 @@ void fastcall sched_fork(task_t *p) * runqueue lock is not a problem. */ current->time_slice = 1; - preempt_disable(); scheduler_tick(); - local_irq_enable(); - preempt_enable(); - } else - local_irq_enable(); + } + local_irq_enable(); + put_cpu(); } /* @@ -1196,10 +1360,9 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) runqueue_t *rq, *this_rq; rq = task_rq_lock(p, &flags); - cpu = task_cpu(p); - this_cpu = smp_processor_id(); - BUG_ON(p->state != TASK_RUNNING); + this_cpu = smp_processor_id(); + cpu = task_cpu(p); /* * We decrease the sleep average of forking parents @@ -1296,22 +1459,40 @@ void fastcall sched_exit(task_t * p) } /** + * prepare_task_switch - prepare to switch tasks + * @rq: the runqueue preparing to switch + * @next: the task we are going to switch to. + * + * This is called with the rq lock held and interrupts off. It must + * be paired with a subsequent finish_task_switch after the context + * switch. + * + * prepare_task_switch sets up locking and calls architecture specific + * hooks. + */ +static inline void prepare_task_switch(runqueue_t *rq, task_t *next) +{ + prepare_lock_switch(rq, next); + prepare_arch_switch(next); +} + +/** * finish_task_switch - clean up after a task-switch * @prev: the thread we just switched away from. * - * We enter this with the runqueue still locked, and finish_arch_switch() - * will unlock it along with doing any other architecture-specific cleanup - * actions. + * finish_task_switch must be called after the context switch, paired + * with a prepare_task_switch call before the context switch. + * finish_task_switch will reconcile locking set up by prepare_task_switch, + * and do any other architecture-specific cleanup actions. * * Note that we may have delayed dropping an mm in context_switch(). If * so, we finish that here outside of the runqueue lock. (Doing it * with the lock held can cause deadlocks; see schedule() for * details.) */ -static inline void finish_task_switch(task_t *prev) +static inline void finish_task_switch(runqueue_t *rq, task_t *prev) __releases(rq->lock) { - runqueue_t *rq = this_rq(); struct mm_struct *mm = rq->prev_mm; unsigned long prev_task_flags; @@ -1329,7 +1510,8 @@ static inline void finish_task_switch(task_t *prev) * Manfred Spraul <manfred@colorfullife.com> */ prev_task_flags = prev->flags; - finish_arch_switch(rq, prev); + finish_arch_switch(prev); + finish_lock_switch(rq, prev); if (mm) mmdrop(mm); if (unlikely(prev_task_flags & PF_DEAD)) @@ -1343,8 +1525,12 @@ static inline void finish_task_switch(task_t *prev) asmlinkage void schedule_tail(task_t *prev) __releases(rq->lock) { - finish_task_switch(prev); - + runqueue_t *rq = this_rq(); + finish_task_switch(rq, prev); +#ifdef __ARCH_WANT_UNLOCKED_CTXSW + /* In this case, finish_task_switch does not reenable preemption */ + preempt_enable(); +#endif if (current->set_child_tid) put_user(current->pid, current->set_child_tid); } @@ -1494,51 +1680,6 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) } /* - * find_idlest_cpu - find the least busy runqueue. - */ -static int find_idlest_cpu(struct task_struct *p, int this_cpu, - struct sched_domain *sd) -{ - unsigned long load, min_load, this_load; - int i, min_cpu; - cpumask_t mask; - - min_cpu = UINT_MAX; - min_load = ULONG_MAX; - - cpus_and(mask, sd->span, p->cpus_allowed); - - for_each_cpu_mask(i, mask) { - load = target_load(i); - - if (load < min_load) { - min_cpu = i; - min_load = load; - - /* break out early on an idle CPU: */ - if (!min_load) - break; - } - } - - /* add +1 to account for the new task */ - this_load = source_load(this_cpu) + SCHED_LOAD_SCALE; - - /* - * Would with the addition of the new task to the - * current CPU there be an imbalance between this - * CPU and the idlest CPU? - * - * Use half of the balancing threshold - new-context is - * a good opportunity to balance. - */ - if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100) - return min_cpu; - - return this_cpu; -} - -/* * If dest_cpu is allowed for this process, migrate the task to it. * This is accomplished by forcing the cpu_allowed mask to only * allow dest_cpu, which will force the cpu onto dest_cpu. Then @@ -1571,37 +1712,16 @@ out: } /* - * sched_exec(): find the highest-level, exec-balance-capable - * domain and try to migrate the task to the least loaded CPU. - * - * execve() is a valuable balancing opportunity, because at this point - * the task has the smallest effective memory and cache footprint. + * sched_exec - execve() is a valuable balancing opportunity, because at + * this point the task has the smallest effective memory and cache footprint. */ void sched_exec(void) { - struct sched_domain *tmp, *sd = NULL; int new_cpu, this_cpu = get_cpu(); - - /* Prefer the current CPU if there's only this task running */ - if (this_rq()->nr_running <= 1) - goto out; - - for_each_domain(this_cpu, tmp) - if (tmp->flags & SD_BALANCE_EXEC) - sd = tmp; - - if (sd) { - schedstat_inc(sd, sbe_attempts); - new_cpu = find_idlest_cpu(current, this_cpu, sd); - if (new_cpu != this_cpu) { - schedstat_inc(sd, sbe_pushed); - put_cpu(); - sched_migrate_task(current, new_cpu); - return; - } - } -out: + new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); put_cpu(); + if (new_cpu != this_cpu) + sched_migrate_task(current, new_cpu); } /* @@ -1632,7 +1752,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, */ static inline int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, - struct sched_domain *sd, enum idle_type idle) + struct sched_domain *sd, enum idle_type idle, int *all_pinned) { /* * We do not migrate tasks that are: @@ -1640,23 +1760,24 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. */ - if (task_running(rq, p)) - return 0; if (!cpu_isset(this_cpu, p->cpus_allowed)) return 0; + *all_pinned = 0; + + if (task_running(rq, p)) + return 0; /* * Aggressive migration if: - * 1) the [whole] cpu is idle, or + * 1) task is cache cold, or * 2) too many balance attempts have failed. */ - if (cpu_and_siblings_are_idle(this_cpu) || \ - sd->nr_balance_failed > sd->cache_nice_tries) + if (sd->nr_balance_failed > sd->cache_nice_tries) return 1; if (task_hot(p, rq->timestamp_last_tick, sd)) - return 0; + return 0; return 1; } @@ -1669,16 +1790,18 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, */ static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, unsigned long max_nr_move, struct sched_domain *sd, - enum idle_type idle) + enum idle_type idle, int *all_pinned) { prio_array_t *array, *dst_array; struct list_head *head, *curr; - int idx, pulled = 0; + int idx, pulled = 0, pinned = 0; task_t *tmp; - if (max_nr_move <= 0 || busiest->nr_running <= 1) + if (max_nr_move == 0) goto out; + pinned = 1; + /* * We first consider expired tasks. Those will likely not be * executed in the near future, and they are most likely to @@ -1717,7 +1840,7 @@ skip_queue: curr = curr->prev; - if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { + if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { if (curr != head) goto skip_queue; idx++; @@ -1746,6 +1869,9 @@ out: * inside pull_task(). */ schedstat_add(sd, lb_gained[idle], pulled); + + if (all_pinned) + *all_pinned = pinned; return pulled; } @@ -1760,8 +1886,15 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; + int load_idx; max_load = this_load = total_load = total_pwr = 0; + if (idle == NOT_IDLE) + load_idx = sd->busy_idx; + else if (idle == NEWLY_IDLE) + load_idx = sd->newidle_idx; + else + load_idx = sd->idle_idx; do { unsigned long load; @@ -1776,9 +1909,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, for_each_cpu_mask(i, group->cpumask) { /* Bias balancing toward cpus of our domain */ if (local_group) - load = target_load(i); + load = target_load(i, load_idx); else - load = source_load(i); + load = source_load(i, load_idx); avg_load += load; } @@ -1792,12 +1925,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (local_group) { this_load = avg_load; this = group; - goto nextgroup; } else if (avg_load > max_load) { max_load = avg_load; busiest = group; } -nextgroup: group = group->next; } while (group != sd->groups); @@ -1870,15 +2001,9 @@ nextgroup: /* Get rid of the scaling factor, rounding down as we divide */ *imbalance = *imbalance / SCHED_LOAD_SCALE; - return busiest; out_balanced: - if (busiest && (idle == NEWLY_IDLE || - (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) { - *imbalance = 1; - return busiest; - } *imbalance = 0; return NULL; @@ -1894,7 +2019,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group) int i; for_each_cpu_mask(i, group->cpumask) { - load = source_load(i); + load = source_load(i, 0); if (load > max_load) { max_load = load; @@ -1906,6 +2031,12 @@ static runqueue_t *find_busiest_queue(struct sched_group *group) } /* + * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but + * so long as it is large enough. + */ +#define MAX_PINNED_INTERVAL 512 + +/* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. * @@ -1917,7 +2048,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, struct sched_group *group; runqueue_t *busiest; unsigned long imbalance; - int nr_moved; + int nr_moved, all_pinned = 0; + int active_balance = 0; spin_lock(&this_rq->lock); schedstat_inc(sd, lb_cnt[idle]); @@ -1934,15 +2066,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, goto out_balanced; } - /* - * This should be "impossible", but since load - * balancing is inherently racy and statistical, - * it could happen in theory. - */ - if (unlikely(busiest == this_rq)) { - WARN_ON(1); - goto out_balanced; - } + BUG_ON(busiest == this_rq); schedstat_add(sd, lb_imbalance[idle], imbalance); @@ -1956,9 +2080,15 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, */ double_lock_balance(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, idle); + imbalance, sd, idle, + &all_pinned); spin_unlock(&busiest->lock); + + /* All tasks on this runqueue were pinned by CPU affinity */ + if (unlikely(all_pinned)) + goto out_balanced; } + spin_unlock(&this_rq->lock); if (!nr_moved) { @@ -1966,36 +2096,38 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, sd->nr_balance_failed++; if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { - int wake = 0; spin_lock(&busiest->lock); if (!busiest->active_balance) { busiest->active_balance = 1; busiest->push_cpu = this_cpu; - wake = 1; + active_balance = 1; } spin_unlock(&busiest->lock); - if (wake) + if (active_balance) wake_up_process(busiest->migration_thread); /* * We've kicked active balancing, reset the failure * counter. */ - sd->nr_balance_failed = sd->cache_nice_tries; + sd->nr_balance_failed = sd->cache_nice_tries+1; } - - /* - * We were unbalanced, but unsuccessful in move_tasks(), - * so bump the balance_interval to lessen the lock contention. - */ - if (sd->balance_interval < sd->max_interval) - sd->balance_interval++; - } else { + } else sd->nr_balance_failed = 0; + if (likely(!active_balance)) { /* We were unbalanced, so reset the balancing interval */ sd->balance_interval = sd->min_interval; + } else { + /* + * If we've begun active balancing, start to back off. This + * case may not be covered by the all_pinned logic if there + * is only 1 task on the busy runqueue (because we don't call + * move_tasks). + */ + if (sd->balance_interval < sd->max_interval) + sd->balance_interval *= 2; } return nr_moved; @@ -2005,8 +2137,10 @@ out_balanced: schedstat_inc(sd, lb_balanced[idle]); + sd->nr_balance_failed = 0; /* tune up the balancing interval */ - if (sd->balance_interval < sd->max_interval) + if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || + (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; return 0; @@ -2030,31 +2164,36 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); if (!group) { - schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); - goto out; + goto out_balanced; } busiest = find_busiest_queue(group); - if (!busiest || busiest == this_rq) { - schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); + if (!busiest) { schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); - goto out; + goto out_balanced; } + BUG_ON(busiest == this_rq); + /* Attempt to move tasks */ double_lock_balance(this_rq, busiest); schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); nr_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, NEWLY_IDLE); + imbalance, sd, NEWLY_IDLE, NULL); if (!nr_moved) schedstat_inc(sd, lb_failed[NEWLY_IDLE]); + else + sd->nr_balance_failed = 0; spin_unlock(&busiest->lock); - -out: return nr_moved; + +out_balanced: + schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); + sd->nr_balance_failed = 0; + return 0; } /* @@ -2086,56 +2225,42 @@ static inline void idle_balance(int this_cpu, runqueue_t *this_rq) static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) { struct sched_domain *sd; - struct sched_group *cpu_group; runqueue_t *target_rq; - cpumask_t visited_cpus; - int cpu; + int target_cpu = busiest_rq->push_cpu; + + if (busiest_rq->nr_running <= 1) + /* no task to move */ + return; + + target_rq = cpu_rq(target_cpu); /* - * Search for suitable CPUs to push tasks to in successively higher - * domains with SD_LOAD_BALANCE set. + * This condition is "impossible", if it occurs + * we need to fix it. Originally reported by + * Bjorn Helgaas on a 128-cpu setup. */ - visited_cpus = CPU_MASK_NONE; - for_each_domain(busiest_cpu, sd) { - if (!(sd->flags & SD_LOAD_BALANCE)) - /* no more domains to search */ - break; + BUG_ON(busiest_rq == target_rq); - schedstat_inc(sd, alb_cnt); + /* move a task from busiest_rq to target_rq */ + double_lock_balance(busiest_rq, target_rq); - cpu_group = sd->groups; - do { - for_each_cpu_mask(cpu, cpu_group->cpumask) { - if (busiest_rq->nr_running <= 1) - /* no more tasks left to move */ - return; - if (cpu_isset(cpu, visited_cpus)) - continue; - cpu_set(cpu, visited_cpus); - if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu) - continue; - - target_rq = cpu_rq(cpu); - /* - * This condition is "impossible", if it occurs - * we need to fix it. Originally reported by - * Bjorn Helgaas on a 128-cpu setup. - */ - BUG_ON(busiest_rq == target_rq); - - /* move a task from busiest_rq to target_rq */ - double_lock_balance(busiest_rq, target_rq); - if (move_tasks(target_rq, cpu, busiest_rq, - 1, sd, SCHED_IDLE)) { - schedstat_inc(sd, alb_pushed); - } else { - schedstat_inc(sd, alb_failed); - } - spin_unlock(&target_rq->lock); - } - cpu_group = cpu_group->next; - } while (cpu_group != sd->groups); - } + /* Search for an sd spanning us and the target CPU. */ + for_each_domain(target_cpu, sd) + if ((sd->flags & SD_LOAD_BALANCE) && + cpu_isset(busiest_cpu, sd->span)) + break; + + if (unlikely(sd == NULL)) + goto out; + + schedstat_inc(sd, alb_cnt); + + if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL)) + schedstat_inc(sd, alb_pushed); + else + schedstat_inc(sd, alb_failed); +out: + spin_unlock(&target_rq->lock); } /* @@ -2156,18 +2281,23 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, unsigned long old_load, this_load; unsigned long j = jiffies + CPU_OFFSET(this_cpu); struct sched_domain *sd; + int i; - /* Update our load */ - old_load = this_rq->cpu_load; this_load = this_rq->nr_running * SCHED_LOAD_SCALE; - /* - * Round up the averaging division if load is increasing. This - * prevents us from getting stuck on 9 if the load is 10, for - * example. - */ - if (this_load > old_load) - old_load++; - this_rq->cpu_load = (old_load + this_load) / 2; + /* Update our load */ + for (i = 0; i < 3; i++) { + unsigned long new_load = this_load; + int scale = 1 << i; + old_load = this_rq->cpu_load[i]; + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. + */ + if (new_load > old_load) + new_load += scale-1; + this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; + |