diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2009-09-11 13:23:18 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-09-11 13:23:18 -0700 |
commit | 774a694f8cd08115d130a290d73c6d8563f26b1b (patch) | |
tree | 2b5f834ac7a149278d2a7e44d7afe69f40ef1431 /kernel | |
parent | 4f0ac854167846bd55cd81dbc9a36e03708aa01c (diff) | |
parent | e1f8450854d69f0291882804406ea1bab3ca44b4 (diff) |
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (64 commits)
sched: Fix sched::sched_stat_wait tracepoint field
sched: Disable NEW_FAIR_SLEEPERS for now
sched: Keep kthreads at default priority
sched: Re-tune the scheduler latency defaults to decrease worst-case latencies
sched: Turn off child_runs_first
sched: Ensure that a child can't gain time over it's parent after fork()
sched: enable SD_WAKE_IDLE
sched: Deal with low-load in wake_affine()
sched: Remove short cut from select_task_rq_fair()
sched: Turn on SD_BALANCE_NEWIDLE
sched: Clean up topology.h
sched: Fix dynamic power-balancing crash
sched: Remove reciprocal for cpu_power
sched: Try to deal with low capacity, fix update_sd_power_savings_stats()
sched: Try to deal with low capacity
sched: Scale down cpu_power due to RT tasks
sched: Implement dynamic cpu_power
sched: Add smt_gain
sched: Update the cpu_power sum during load-balance
sched: Add SD_PREFER_SIBLING
...
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/kthread.c | 4 | ||||
-rw-r--r-- | kernel/sched.c | 1099 | ||||
-rw-r--r-- | kernel/sched_cpupri.c | 30 | ||||
-rw-r--r-- | kernel/sched_debug.c | 4 | ||||
-rw-r--r-- | kernel/sched_fair.c | 84 | ||||
-rw-r--r-- | kernel/sched_features.h | 2 | ||||
-rw-r--r-- | kernel/sched_rt.c | 62 | ||||
-rw-r--r-- | kernel/sysctl.c | 24 | ||||
-rw-r--r-- | kernel/workqueue.c | 2 |
9 files changed, 826 insertions, 485 deletions
diff --git a/kernel/kthread.c b/kernel/kthread.c index eb8751aa041..5fe709982ca 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -16,8 +16,6 @@ #include <linux/mutex.h> #include <trace/events/sched.h> -#define KTHREAD_NICE_LEVEL (-5) - static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; @@ -145,7 +143,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), * The kernel thread should not inherit these properties. */ sched_setscheduler_nocheck(create.result, SCHED_NORMAL, ¶m); - set_user_nice(create.result, KTHREAD_NICE_LEVEL); set_cpus_allowed_ptr(create.result, cpu_all_mask); } return create.result; @@ -221,7 +218,6 @@ int kthreadd(void *unused) /* Setup a clean context for our children to inherit. */ set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); - set_user_nice(tsk, KTHREAD_NICE_LEVEL); set_cpus_allowed_ptr(tsk, cpu_all_mask); set_mems_allowed(node_possible_map); diff --git a/kernel/sched.c b/kernel/sched.c index 4066241ae9f..e27a53685ed 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -64,7 +64,6 @@ #include <linux/tsacct_kern.h> #include <linux/kprobes.h> #include <linux/delayacct.h> -#include <linux/reciprocal_div.h> #include <linux/unistd.h> #include <linux/pagemap.h> #include <linux/hrtimer.h> @@ -120,30 +119,8 @@ */ #define RUNTIME_INF ((u64)~0ULL) -#ifdef CONFIG_SMP - static void double_rq_lock(struct rq *rq1, struct rq *rq2); -/* - * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) - * Since cpu_power is a 'constant', we can use a reciprocal divide. - */ -static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) -{ - return reciprocal_divide(load, sg->reciprocal_cpu_power); -} - -/* - * Each time a sched group cpu_power is changed, - * we must compute its reciprocal value - */ -static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) -{ - sg->__cpu_power += val; - sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); -} -#endif - static inline int rt_policy(int policy) { if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) @@ -309,8 +286,8 @@ void set_tg_uid(struct user_struct *user) /* * Root task group. - * Every UID task group (including init_task_group aka UID-0) will - * be a child to this group. + * Every UID task group (including init_task_group aka UID-0) will + * be a child to this group. */ struct task_group root_task_group; @@ -318,7 +295,7 @@ struct task_group root_task_group; /* Default task group's sched entity on each cpu */ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); /* Default task group's cfs_rq on each cpu */ -static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU(struct cfs_rq, init_tg_cfs_rq) ____cacheline_aligned_in_smp; #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED @@ -616,6 +593,7 @@ struct rq { unsigned char idle_at_tick; /* For active balancing */ + int post_schedule; int active_balance; int push_cpu; /* cpu of this runqueue: */ @@ -626,6 +604,9 @@ struct rq { struct task_struct *migration_thread; struct list_head migration_queue; + + u64 rt_avg; + u64 age_stamp; #endif /* calc_load related fields */ @@ -693,6 +674,7 @@ static inline int cpu_of(struct rq *rq) #define this_rq() (&__get_cpu_var(runqueues)) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +#define raw_rq() (&__raw_get_cpu_var(runqueues)) inline void update_rq_clock(struct rq *rq) { @@ -861,6 +843,14 @@ unsigned int sysctl_sched_shares_ratelimit = 250000; unsigned int sysctl_sched_shares_thresh = 4; /* + * period over which we average the RT time consumption, measured + * in ms. + * + * default: 1s + */ +const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; + +/* * period over which we measure -rt task cpu usage in us. * default: 1s */ @@ -1278,12 +1268,37 @@ void wake_up_idle_cpu(int cpu) } #endif /* CONFIG_NO_HZ */ +static u64 sched_avg_period(void) +{ + return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; +} + +static void sched_avg_update(struct rq *rq) +{ + s64 period = sched_avg_period(); + + while ((s64)(rq->clock - rq->age_stamp) > period) { + rq->age_stamp += period; + rq->rt_avg /= 2; + } +} + +static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) +{ + rq->rt_avg += rt_delta; + sched_avg_update(rq); +} + #else /* !CONFIG_SMP */ static void resched_task(struct task_struct *p) { assert_spin_locked(&task_rq(p)->lock); set_tsk_need_resched(p); } + +static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) +{ +} #endif /* CONFIG_SMP */ #if BITS_PER_LONG == 32 @@ -1513,28 +1528,35 @@ static unsigned long cpu_avg_load_per_task(int cpu) #ifdef CONFIG_FAIR_GROUP_SCHED +struct update_shares_data { + unsigned long rq_weight[NR_CPUS]; +}; + +static DEFINE_PER_CPU(struct update_shares_data, update_shares_data); + static void __set_se_shares(struct sched_entity *se, unsigned long shares); /* * Calculate and set the cpu's group shares. */ -static void -update_group_shares_cpu(struct task_group *tg, int cpu, - unsigned long sd_shares, unsigned long sd_rq_weight) +static void update_group_shares_cpu(struct task_group *tg, int cpu, + unsigned long sd_shares, + unsigned long sd_rq_weight, + struct update_shares_data *usd) { - unsigned long shares; - unsigned long rq_weight; + unsigned long shares, rq_weight; + int boost = 0; - if (!tg->se[cpu]) - return; - - rq_weight = tg->cfs_rq[cpu]->rq_weight; + rq_weight = usd->rq_weight[cpu]; + if (!rq_weight) { + boost = 1; + rq_weight = NICE_0_LOAD; + } /* - * \Sum shares * rq_weight - * shares = ----------------------- - * \Sum rq_weight - * + * \Sum_j shares_j * rq_weight_i + * shares_i = ----------------------------- + * \Sum_j rq_weight_j */ shares = (sd_shares * rq_weight) / sd_rq_weight; shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); @@ -1545,8 +1567,8 @@ update_group_shares_cpu(struct task_group *tg, int cpu, unsigned long flags; spin_lock_irqsave(&rq->lock, flags); - tg->cfs_rq[cpu]->shares = shares; - + tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; + tg->cfs_rq[cpu]->shares = boost ? 0 : shares; __set_se_shares(tg->se[cpu], shares); spin_unlock_irqrestore(&rq->lock, flags); } @@ -1559,22 +1581,30 @@ update_group_shares_cpu(struct task_group *tg, int cpu, */ static int tg_shares_up(struct task_group *tg, void *data) { - unsigned long weight, rq_weight = 0; - unsigned long shares = 0; + unsigned long weight, rq_weight = 0, shares = 0; + struct update_shares_data *usd; struct sched_domain *sd = data; + unsigned long flags; int i; + if (!tg->se[0]) + return 0; + + local_irq_save(flags); + usd = &__get_cpu_var(update_shares_data); + for_each_cpu(i, sched_domain_span(sd)) { + weight = tg->cfs_rq[i]->load.weight; + usd->rq_weight[i] = weight; + /* * If there are currently no tasks on the cpu pretend there * is one of average load so that when a new task gets to * run here it will not get delayed by group starvation. */ - weight = tg->cfs_rq[i]->load.weight; if (!weight) weight = NICE_0_LOAD; - tg->cfs_rq[i]->rq_weight = weight; rq_weight += weight; shares += tg->cfs_rq[i]->shares; } @@ -1586,7 +1616,9 @@ static int tg_shares_up(struct task_group *tg, void *data) shares = tg->shares; for_each_cpu(i, sched_domain_span(sd)) - update_group_shares_cpu(tg, i, shares, rq_weight); + update_group_shares_cpu(tg, i, shares, rq_weight, usd); + + local_irq_restore(flags); return 0; } @@ -1616,8 +1648,14 @@ static int tg_load_down(struct task_group *tg, void *data) static void update_shares(struct sched_domain *sd) { - u64 now = cpu_clock(raw_smp_processor_id()); - s64 elapsed = now - sd->last_update; + s64 elapsed; + u64 now; + + if (root_task_group_empty()) + return; + + now = cpu_clock(raw_smp_processor_id()); + elapsed = now - sd->last_update; if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { sd->last_update = now; @@ -1627,6 +1665,9 @@ static void update_shares(struct sched_domain *sd) static void update_shares_locked(struct rq *rq, struct sched_domain *sd) { + if (root_task_group_empty()) + return; + spin_unlock(&rq->lock); update_shares(sd); spin_lock(&rq->lock); @@ -1634,6 +1675,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) static void update_h_load(long cpu) { + if (root_task_group_empty()) + return; + walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } @@ -2268,8 +2312,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) } /* Adjust by relative CPU power of the group */ - avg_load = sg_div_cpu_power(group, - avg_load * SCHED_LOAD_SCALE); + avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; if (local_group) { this_load = avg_load; @@ -2637,9 +2680,32 @@ void sched_fork(struct task_struct *p, int clone_flags) set_task_cpu(p, cpu); /* - * Make sure we do not leak PI boosting priority to the child: + * Make sure we do not leak PI boosting priority to the child. */ p->prio = current->normal_prio; + + /* + * Revert to default priority/policy on fork if requested. + */ + if (unlikely(p->sched_reset_on_fork)) { + if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) + p->policy = SCHED_NORMAL; + + if (p->normal_prio < DEFAULT_PRIO) + p->prio = DEFAULT_PRIO; + + if (PRIO_TO_NICE(p->static_prio) < 0) { + p->static_prio = NICE_TO_PRIO(0); + set_load_weight(p); + } + + /* + * We don't need the reset flag anymore after the fork. It has + * fulfilled its duty: + */ + p->sched_reset_on_fork = 0; + } + if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; @@ -2796,12 +2862,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) { struct mm_struct *mm = rq->prev_mm; long prev_state; -#ifdef CONFIG_SMP - int post_schedule = 0; - - if (current->sched_class->needs_post_schedule) - post_schedule = current->sched_class->needs_post_schedule(rq); -#endif rq->prev_mm = NULL; @@ -2820,10 +2880,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) finish_arch_switch(prev); perf_counter_task_sched_in(current, cpu_of(rq)); finish_lock_switch(rq, prev); -#ifdef CONFIG_SMP - if (post_schedule) - current->sched_class->post_schedule(rq); -#endif fire_sched_in_preempt_notifiers(current); if (mm) @@ -2838,6 +2894,42 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) } } +#ifdef CONFIG_SMP + +/* assumes rq->lock is held */ +static inline void pre_schedule(struct rq *rq, struct task_struct *prev) +{ + if (prev->sched_class->pre_schedule) + prev->sched_class->pre_schedule(rq, prev); +} + +/* rq->lock is NOT held, but preemption is disabled */ +static inline void post_schedule(struct rq *rq) +{ + if (rq->post_schedule) { + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + if (rq->curr->sched_class->post_schedule) + rq->curr->sched_class->post_schedule(rq); + spin_unlock_irqrestore(&rq->lock, flags); + + rq->post_schedule = 0; + } +} + +#else + +static inline void pre_schedule(struct rq *rq, struct task_struct *p) +{ +} + +static inline void post_schedule(struct rq *rq) +{ +} + +#endif + /** * schedule_tail - first thing a freshly forked thread must call. * @prev: the thread we just switched away from. @@ -2848,6 +2940,13 @@ asmlinkage void schedule_tail(struct task_struct *prev) struct rq *rq = this_rq(); finish_task_switch(rq, prev); + + /* + * FIXME: do we need to worry about rq being invalidated by the + * task_switch? + */ + post_schedule(rq); + #ifdef __ARCH_WANT_UNLOCKED_CTXSW /* In this case, finish_task_switch does not reenable preemption */ preempt_enable(); @@ -3379,9 +3478,10 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, { const struct sched_class *class; - for (class = sched_class_highest; class; class = class->next) + for_each_class(class) { if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) return 1; + } return 0; } @@ -3544,7 +3644,7 @@ static inline void update_sd_power_savings_stats(struct sched_group *group, * capacity but still has some space to pick up some load * from other group and save more power */ - if (sgs->sum_nr_running > sgs->group_capacity - 1) + if (sgs->sum_nr_running + 1 > sgs->group_capacity) return; if (sgs->sum_nr_running > sds->leader_nr_running || @@ -3611,6 +3711,77 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, } #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ +unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) +{ + unsigned long weight = cpumask_weight(sched_domain_span(sd)); + unsigned long smt_gain = sd->smt_gain; + + smt_gain /= weight; + + return smt_gain; +} + +unsigned long scale_rt_power(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + u64 total, available; + + sched_avg_update(rq); + + total = sched_avg_period() + (rq->clock - rq->age_stamp); + available = total - rq->rt_avg; + + if (unlikely((s64)total < SCHED_LOAD_SCALE)) + total = SCHED_LOAD_SCALE; + + total >>= SCHED_LOAD_SHIFT; + + return div_u64(available, total); +} + +static void update_cpu_power(struct sched_domain *sd, int cpu) +{ + unsigned long weight = cpumask_weight(sched_domain_span(sd)); + unsigned long power = SCHED_LOAD_SCALE; + struct sched_group *sdg = sd->groups; + + /* here we could scale based on cpufreq */ + + if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { + power *= arch_scale_smt_power(sd, cpu); + power >>= SCHED_LOAD_SHIFT; + } + + power *= scale_rt_power(cpu); + power >>= SCHED_LOAD_SHIFT; + + if (!power) + power = 1; + + sdg->cpu_power = power; +} + +static void update_group_power(struct sched_domain *sd, int cpu) +{ + struct sched_domain *child = sd->child; + struct sched_group *group, *sdg = sd->groups; + unsigned long power; + + if (!child) { + update_cpu_power(sd, cpu); + return; + } + + power = 0; + + group = child->groups; + do { + power += group->cpu_power; + group = group->next; + } while (group != child->groups); + + sdg->cpu_power = power; +} /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. @@ -3624,7 +3795,8 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, * @balance: Should we balance. * @sgs: variable to hold the statistics for this group. */ -static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, +static inline void update_sg_lb_stats(struct sched_domain *sd, + struct sched_group *group, int this_cpu, enum cpu_idle_type idle, int load_idx, int *sd_idle, int local_group, const struct cpumask *cpus, int *balance, struct sg_lb_stats *sgs) @@ -3635,8 +3807,11 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, unsigned long sum_avg_load_per_task; unsigned long avg_load_per_task; - if (local_group) + if (local_group) { balance_cpu = group_first_cpu(group); + if (balance_cpu == this_cpu) + update_group_power(sd, this_cpu); + } /* Tally up the load of all CPUs in the group */ sum_avg_load_per_task = avg_load_per_task = 0; @@ -3685,8 +3860,7 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, } /* Adjust by relative CPU power of the group */ - sgs->avg_load = sg_div_cpu_power(group, - sgs->group_load * SCHED_LOAD_SCALE); + sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; /* @@ -3698,14 +3872,14 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, * normalized nr_running number somewhere that negates * the hierarchy? */ - avg_load_per_task = sg_div_cpu_power(group, - sum_avg_load_per_task * SCHED_LOAD_SCALE); + avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) / + group->cpu_power; if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) sgs->group_imb = 1; - sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; - + sgs->group_capacity = + DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); } /** @@ -3723,9 +3897,13 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, const struct cpumask *cpus, int *balance, struct sd_lb_stats *sds) { + struct sched_domain *child = sd->child; struct sched_group *group = sd->groups; struct sg_lb_stats sgs; - int load_idx; + int load_idx, prefer_sibling = 0; + + if (child && child->flags & SD_PREFER_SIBLING) + prefer_sibling = 1; init_sd_power_savings_stats(sd, sds, idle); load_idx = get_sd_load_idx(sd, idle); @@ -3736,14 +3914,22 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(group)); memset(&sgs, 0, sizeof(sgs)); - update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, + update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, local_group, cpus, balance, &sgs); if (local_group && balance && !(*balance)) return; sds->total_load += sgs.group_load; - sds->total_pwr += group->__cpu_power; + sds->total_pwr += group->cpu_power; + + /* + * In case the child domain prefers tasks go to siblings + * first, lower the group capacity to one so that we'll try + * and move all the excess tasks away. + */ + if (prefer_sibling) + sgs.group_capacity = min(sgs.group_capacity, 1UL); if (local_group) { sds->this_load = sgs.avg_load; @@ -3763,7 +3949,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, update_sd_power_savings_stats(group, sds, local_group, &sgs); group = group->next; } while (group != sd->groups); - } /** @@ -3801,28 +3986,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, * moving them. */ - pwr_now += sds->busiest->__cpu_power * + pwr_now += sds->busiest->cpu_power * min(sds->busiest_load_per_task, sds->max_load); - pwr_now += sds->this->__cpu_power * + pwr_now += sds->this->cpu_power * min(sds->this_load_per_task, sds->this_load); pwr_now /= SCHED_LOAD_SCALE; /* Amount of load we'd subtract */ - tmp = sg_div_cpu_power(sds->busiest, - sds->busiest_load_per_task * SCHED_LOAD_SCALE); + tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / + sds->busiest->cpu_power; if (sds->max_load > tmp) - pwr_move += sds->busiest->__cpu_power * + pwr_move += sds->busiest->cpu_power * min(sds->busiest_load_per_task, sds->max_load - tmp); /* Amount of load we'd add */ - if (sds->max_load * sds->busiest->__cpu_power < + if (sds->max_load * sds->busiest->cpu_power < sds->busiest_load_per_task * SCHED_LOAD_SCALE) - tmp = sg_div_cpu_power(sds->this, - sds->max_load * sds->busiest->__cpu_power); + tmp = (sds->max_load * sds->busiest->cpu_power) / + sds->this->cpu_power; else - tmp = sg_div_cpu_power(sds->this, - sds->busiest_load_per_task * SCHED_LOAD_SCALE); - pwr_move += sds->this->__cpu_power * + tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / + sds->this->cpu_power; + pwr_move += sds->this->cpu_power * min(sds->this_load_per_task, sds->this_load + tmp); pwr_move /= SCHED_LOAD_SCALE; @@ -3857,8 +4042,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, sds->max_load - sds->busiest_load_per_task); /* How much load to actually move to equalise the imbalance */ - *imbalance = min(max_pull * sds->busiest->__cpu_power, - (sds->avg_load - sds->this_load) * sds->this->__cpu_power) + *imbalance = min(max_pull * sds->busiest->cpu_power, + (sds->avg_load - sds->this_load) * sds->this->cpu_power) / SCHED_LOAD_SCALE; /* @@ -3976,6 +4161,26 @@ ret: return NULL; } +static struct sched_group *group_of(int cpu) +{ + struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); + + if (!sd) + return NULL; + + return sd->groups; +} + +static unsigned long power_of(int cpu) +{ + struct sched_group *group = group_of(cpu); + + if (!group) + return SCHED_LOAD_SCALE; + + return group->cpu_power; +} + /* * find_busiest_queue - find the busiest runqueue among the cpus in group. */ @@ -3988,15 +4193,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, int i; for_each_cpu(i, sched_group_cpus(group)) { + unsigned long power = power_of(i); + unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); unsigned long wl; if (!cpumask_test_cpu(i, cpus)) continue; rq = cpu_rq(i); - wl = weighted_cpuload(i); + wl = weighted_cpuload(i) * SCHED_LOAD_SCALE; + wl /= power; - if (rq->nr_running == 1 && wl > imbalance) + if (capacity && rq->nr_running == 1 && wl > imbalance) continue; if (wl > max_load) { @@ -5349,10 +5557,7 @@ need_resched_nonpreemptible: switch_count = &prev->nvcsw; } -#ifdef CONFIG_SMP - if (prev->sched_class->pre_schedule) - prev->sched_class->pre_schedule(rq, prev); -#endif + pre_schedule(rq, prev); if (unlikely(!rq->nr_running)) idle_balance(cpu, rq); @@ -5378,6 +5583,8 @@ need_resched_nonpreemptible: } else spin_unlock_irq(&rq->lock); + post_schedule(rq); + if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; @@ -6123,17 +6330,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy, unsigned long flags; const struct sched_class *prev_class = p->sched_class; struct rq *rq; + int reset_on_fork; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); recheck: /* double check policy once rq lock held */ - if (policy < 0) + if (policy < 0) { + reset_on_fork = p->sched_reset_on_fork; policy = oldpolicy = p->policy; - else if (policy != SCHED_FIFO && policy != SCHED_RR && - policy != SCHED_NORMAL && policy != SCHED_BATCH && - policy != SCHED_IDLE) - return -EINVAL; + } else { + reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); + policy &= ~SCHED_RESET_ON_FORK; + + if (policy != SCHED_FIFO && policy != SCHED_RR && + policy != SCHED_NORMAL && policy != SCHED_BATCH && + policy != SCHED_IDLE) + return -EINVAL; + } + /* * Valid priorities for SCHED_FIFO and SCHED_RR are * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, @@ -6177,6 +6392,10 @@ recheck: /* can't change other user's priorities */ if (!check_same_owner(p)) return -EPERM; + + /* Normal users shall not reset the sched_reset_on_fork flag */ + if (p->sched_reset_on_fork && !reset_on_fork) + return -EPERM; } if (user) { @@ -6220,6 +6439,8 @@ recheck: if (running) p->sched_class->put_prev_task(rq, p); + p->sched_reset_on_fork = reset_on_fork; + oldprio = p->prio; __setscheduler(rq, p, policy, param->sched_priority); @@ -6336,14 +6557,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) if (p) { retval = security_task_getscheduler(p); if (!retval) - retval = p->policy; + retval = p->policy + | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); } read_unlock(&tasklist_lock); return retval; } /** - * sys_sched_getscheduler - get the RT priority of a thread + * sys_sched_getparam - get the RT priority of a thread * @pid: the pid in question. * @param: structure containing the RT priority. */ @@ -6571,19 +6793,9 @@ static inline int should_resched(void) static void __cond_resched(void) { -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP - __might_sleep(__FILE__, __LINE__); -#endif - /* - * The BKS might be reacquired before we have dropped - * PREEMPT_ACTIVE, which could trigger a second - * cond_resched() call. - */ - do { - add_preempt_count(PREEMPT_ACTIVE); - schedule(); - sub_preempt_count(PREEMPT_ACTIVE); - } while (need_resched()); + add_preempt_count(PREEMPT_ACTIVE); + schedule(); + sub_preempt_count(PREEMPT_ACTIVE); } int __sched _cond_resched(void) @@ -6597,14 +6809,14 @@ int __sched _cond_resched(void) EXPORT_SYMBOL(_cond_resched); /* - * cond_resched_lock() - if a reschedule is pending, drop the given lock, + * __cond_resched_lock() - if a reschedule is pending, drop the given lock, * call schedule, and on return reacquire the lock. * * This works OK both with and without CONFIG_PREEMPT. We do strange low-level * operations here to prevent schedule() from being called twice (once via * spin_unlock(), once by hand). */ -int cond_resched_lock(spinlock_t *lock) +int __cond_resched_lock(spinlock_t *lock) { int resched = should_resched(); int ret = 0; @@ -6622,9 +6834,9 @@ int cond_resched_lock(spinlock_t *lock) } return ret; } -EXPORT_SYMBOL(cond_resched_lock); +EXPORT_SYMBOL(__cond_resched_lock); -int __sched cond_resched_softirq(void) +int __sched __cond_resched_softirq(void) { BUG_ON(!in_softirq()); @@ -6636,7 +6848,7 @@ int __sched cond_resched_softirq(void) } return 0; } -EXPORT_SYMBOL(cond_resched_softirq); +EXPORT_SYMBOL(__cond_resched_softirq); /** * yield - yield the current processor to other threads. @@ -6660,11 +6872,13 @@ EXPORT_SYMBOL(yield); */ void __sched io_schedule(void) { - struct rq *rq = &__raw_get_cpu_var(runqueues); + struct rq *rq = raw_rq(); delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); + current->in_iowait = 1; schedule(); + current->in_iowait = 0; atomic_dec(&rq->nr_iowait); delayacct_blkio_end(); } @@ -6672,12 +6886,14 @@ EXPORT_SYMBOL(io_schedule); long __sched io_schedule_timeout(long timeout) { - struct rq *rq = &__raw_get_cpu_var(runqueues); + struct rq *rq = raw_rq(); long ret; delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); + current->in_iowait = 1; ret = schedule_timeout(timeout); + current->in_iowait = 0; atomic_dec(&rq->nr_iowait); delayacct_blkio_end(); return ret; @@ -6994,8 +7210,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ + struct task_struct *mt = rq->migration_thread; + + get_task_struct(mt); task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); + put_task_struct(mt); wait_for_completion(&req.done); tlb_migrate_finish(p->mm); return 0; @@ -7642,7 +7862,7 @@ static int __init migration_init(void) migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); - return err; + return 0; } early_initcall(migration_init); #endif @@ -7689,7 +7909,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!group->__cpu_power) { + if (!group->cpu_power) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: domain->cpu_power not " "set\n"); @@ -7713,9 +7933,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); printk(KERN_CONT " %s", str); - if (group->__cpu_power != SCHED_LOAD_SCALE) { - printk(KERN_CONT " (__cpu_power = %d)", - group->__cpu_power); + if (group->cpu_power != SCHED_LOAD_SCALE) { + printk(KERN_CONT " (cpu_power = %d)", + group->cpu_power); } group = group->next; @@ -7858,7 +8078,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) rq->rd = rd; cpumask_set_cpu(rq->cpu, rd->span); - if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) + if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) set_rq_online(rq); spin_unlock_irqrestore(&rq->lock, flags); @@ -8000,7 +8220,7 @@ init_sched_build_groups(const struct cpumask *span, continue; cpumask_clear(sched_group_cpus(sg)); - sg->__cpu_power = 0; + sg->cpu_power = 0; for_each_cpu(j, span) { if (group_fn(j, cpu_map, NULL, tmpmask) != group) @@ -8108,6 +8328,39 @@ struct static_sched_domain { DECLARE_BITMAP(span, CONFIG_NR_CPUS); }; +struct s_data { +#ifdef CONFIG_NUMA + int sd_allnodes; + cpumask_var_t domainspan; + cpumask_var_t covered; + cpumask_var_t notcovered; +#endif + cpumask_var_t nodemask; + cpumask_var_t this_sibling_map; + cpumask_var_t this_core_map; + cpumask_var_t send_covered; + cpumask_var_t tmpmask; + struct sched_group **sched_group_nodes; + struct root_domain *rd; +}; + +enum s_alloc { + sa_sched_groups = 0, + sa_rootdomain, + sa_tmpmask, + sa_send_covered, + sa_this_core_map, + sa_this_sibling_map, + sa_nodemask, + sa_sched_group_nodes, +#ifdef CONFIG_NUMA + sa_notcovered, + sa_covered, + sa_domainspan, +#endif + sa_none, +}; + /* * SMT sched-domains: */ @@ -8225,11 +8478,76 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) continue; } - sg_inc_cpu_power(sg, sd->groups->__cpu_power); + sg->cpu_power += sd->groups->cpu_power; } sg = sg->next; } while (sg != group_head); } + +static int build_numa_sched_groups(struct s_data *d, + const struct cpumask *cpu_map, int num) +{ + struct sched_domain *sd; + struct sched_group *sg, *prev; + int n, j; + + cpumask_clear(d->covered); + cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); + if (cpumask_empty(d->nodemask)) { + d->sched_group_nodes[num] = NULL; + goto out; + } + + sched_domain_node_span(num, d->domainspan); + cpumask_and(d->domainspan, d->domainspan, cpu_map); + + sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, num); + if (!sg) { + printk(KERN_WARNING "Can not alloc domain group for node %d\n", + num); + return -ENOMEM; + } + d->sched_group_nodes[num] = sg; + + for_each_cpu(j, d->nodemask) { + sd = &per_cpu(node_domains, j |