diff options
-rw-r--r-- | Documentation/feature-removal-schedule.txt | 15 | ||||
-rw-r--r-- | include/linux/kernel.h | 5 | ||||
-rw-r--r-- | include/linux/percpu_counter.h | 9 | ||||
-rw-r--r-- | include/linux/sched.h | 25 | ||||
-rw-r--r-- | init/Kconfig | 81 | ||||
-rw-r--r-- | kernel/ksysfs.c | 8 | ||||
-rw-r--r-- | kernel/kthread.c | 2 | ||||
-rw-r--r-- | kernel/sched.c | 2125 | ||||
-rw-r--r-- | kernel/sched_cpupri.c | 4 | ||||
-rw-r--r-- | kernel/sched_fair.c | 1699 | ||||
-rw-r--r-- | kernel/sched_idletask.c | 23 | ||||
-rw-r--r-- | kernel/sched_rt.c | 54 | ||||
-rw-r--r-- | kernel/sys.c | 5 | ||||
-rw-r--r-- | kernel/user.c | 305 |
14 files changed, 1827 insertions, 2533 deletions
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index b9eba900e0f..ea401495528 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -6,21 +6,6 @@ be removed from this file. --------------------------- -What: USER_SCHED -When: 2.6.34 - -Why: USER_SCHED was implemented as a proof of concept for group scheduling. - The effect of USER_SCHED can already be achieved from userspace with - the help of libcgroup. The removal of USER_SCHED will also simplify - the scheduler code with the removal of one major ifdef. There are also - issues USER_SCHED has with USER_NS. A decision was taken not to fix - those and instead remove USER_SCHED. Also new group scheduling - features will not be implemented for USER_SCHED. - -Who: Dhaval Giani <dhaval@linux.vnet.ibm.com> - ---------------------------- - What: PRISM54 When: 2.6.34 diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 328bca609b9..1221d2331a6 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -124,7 +124,7 @@ extern int _cond_resched(void); #endif #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP - void __might_sleep(char *file, int line, int preempt_offset); + void __might_sleep(const char *file, int line, int preempt_offset); /** * might_sleep - annotation for functions that can sleep * @@ -138,7 +138,8 @@ extern int _cond_resched(void); # define might_sleep() \ do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) #else - static inline void __might_sleep(char *file, int line, int preempt_offset) { } + static inline void __might_sleep(const char *file, int line, + int preempt_offset) { } # define might_sleep() do { might_resched(); } while (0) #endif diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index a7684a51399..794662b2be5 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -98,9 +98,6 @@ static inline void 
percpu_counter_set(struct percpu_counter *fbc, s64 amount) fbc->count = amount; } -#define __percpu_counter_add(fbc, amount, batch) \ - percpu_counter_add(fbc, amount) - static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) { @@ -109,6 +106,12 @@ percpu_counter_add(struct percpu_counter *fbc, s64 amount) preempt_enable(); } +static inline void +__percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch) +{ + percpu_counter_add(fbc, amount); +} + static inline s64 percpu_counter_read(struct percpu_counter *fbc) { return fbc->count; diff --git a/include/linux/sched.h b/include/linux/sched.h index 1f5fa53b46b..0eef87b58ea 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -740,14 +740,6 @@ struct user_struct { uid_t uid; struct user_namespace *user_ns; -#ifdef CONFIG_USER_SCHED - struct task_group *tg; -#ifdef CONFIG_SYSFS - struct kobject kobj; - struct delayed_work work; -#endif -#endif - #ifdef CONFIG_PERF_EVENTS atomic_long_t locked_vm; #endif @@ -1087,7 +1079,8 @@ struct sched_domain; struct sched_class { const struct sched_class *next; - void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup); + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup, + bool head); void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); void (*yield_task) (struct rq *rq); @@ -1099,14 +1092,6 @@ struct sched_class { #ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); - unsigned long (*load_balance) (struct rq *this_rq, int this_cpu, - struct rq *busiest, unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, int *this_best_prio); - - int (*move_one_task) (struct rq *this_rq, int this_cpu, - struct rq *busiest, struct sched_domain *sd, - enum cpu_idle_type idle); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); void (*post_schedule) (struct rq *this_rq); void (*task_waking) (struct rq 
*this_rq, struct task_struct *task); @@ -2520,13 +2505,9 @@ extern long sched_getaffinity(pid_t pid, struct cpumask *mask); extern void normalize_rt_tasks(void); -#ifdef CONFIG_GROUP_SCHED +#ifdef CONFIG_CGROUP_SCHED extern struct task_group init_task_group; -#ifdef CONFIG_USER_SCHED -extern struct task_group root_task_group; -extern void set_tg_uid(struct user_struct *user); -#endif extern struct task_group *sched_create_group(struct task_group *parent); extern void sched_destroy_group(struct task_group *tg); diff --git a/init/Kconfig b/init/Kconfig index c6d95f8ea05..089a230e565 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -461,57 +461,6 @@ config LOG_BUF_SHIFT config HAVE_UNSTABLE_SCHED_CLOCK bool -config GROUP_SCHED - bool "Group CPU scheduler" - depends on EXPERIMENTAL - default n - help - This feature lets CPU scheduler recognize task groups and control CPU - bandwidth allocation to such task groups. - In order to create a group from arbitrary set of processes, use - CONFIG_CGROUPS. (See Control Group support.) - -config FAIR_GROUP_SCHED - bool "Group scheduling for SCHED_OTHER" - depends on GROUP_SCHED - default GROUP_SCHED - -config RT_GROUP_SCHED - bool "Group scheduling for SCHED_RR/FIFO" - depends on EXPERIMENTAL - depends on GROUP_SCHED - default n - help - This feature lets you explicitly allocate real CPU bandwidth - to users or control groups (depending on the "Basis for grouping tasks" - setting below. If enabled, it will also make it impossible to - schedule realtime tasks for non-root users until you allocate - realtime bandwidth for them. - See Documentation/scheduler/sched-rt-group.txt for more information. - -choice - depends on GROUP_SCHED - prompt "Basis for grouping tasks" - default USER_SCHED - -config USER_SCHED - bool "user id" - help - This option will choose userid as the basis for grouping - tasks, thus providing equal CPU bandwidth to each user. 
- -config CGROUP_SCHED - bool "Control groups" - depends on CGROUPS - help - This option allows you to create arbitrary task groups - using the "cgroup" pseudo filesystem and control - the cpu bandwidth allocated to each such task group. - Refer to Documentation/cgroups/cgroups.txt for more - information on "cgroup" pseudo filesystem. - -endchoice - menuconfig CGROUPS boolean "Control Group support" help @@ -632,6 +581,36 @@ config CGROUP_MEM_RES_CTLR_SWAP Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page size is 4096bytes, 512k per 1Gbytes of swap. +menuconfig CGROUP_SCHED + bool "Group CPU scheduler" + depends on EXPERIMENTAL && CGROUPS + default n + help + This feature lets CPU scheduler recognize task groups and control CPU + bandwidth allocation to such task groups. It uses cgroups to group + tasks. + +if CGROUP_SCHED +config FAIR_GROUP_SCHED + bool "Group scheduling for SCHED_OTHER" + depends on CGROUP_SCHED + default CGROUP_SCHED + +config RT_GROUP_SCHED + bool "Group scheduling for SCHED_RR/FIFO" + depends on EXPERIMENTAL + depends on CGROUP_SCHED + default n + help + This feature lets you explicitly allocate real CPU bandwidth + to control groups. + If enabled, it will also make it impossible to + schedule realtime tasks for non-root users until you allocate + realtime bandwidth for them. + See Documentation/scheduler/sched-rt-group.txt for more information. 
+ +endif #CGROUP_SCHED + endif # CGROUPS config MM_OWNER diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 3feaf5a7451..6b1ccc3f020 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -197,16 +197,8 @@ static int __init ksysfs_init(void) goto group_exit; } - /* create the /sys/kernel/uids/ directory */ - error = uids_sysfs_init(); - if (error) - goto notes_exit; - return 0; -notes_exit: - if (notes_size > 0) - sysfs_remove_bin_file(kernel_kobj, &notes_attr); group_exit: sysfs_remove_group(kernel_kobj, &kernel_attr_group); kset_exit: diff --git a/kernel/kthread.c b/kernel/kthread.c index fbb6222fe7e..82ed0ea1519 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -101,7 +101,7 @@ static void create_kthread(struct kthread_create_info *create) * * Description: This helper function creates and names a kernel * thread. The thread will be stopped: use wake_up_process() to start - * it. See also kthread_run(), kthread_create_on_cpu(). + * it. See also kthread_run(). * * When woken, the thread will run @threadfn() with @data as its * argument. 
@threadfn() can either call do_exit() directly if it is a diff --git a/kernel/sched.c b/kernel/sched.c index caf54e1eef6..6a212c97f52 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -233,7 +233,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) */ static DEFINE_MUTEX(sched_domains_mutex); -#ifdef CONFIG_GROUP_SCHED +#ifdef CONFIG_CGROUP_SCHED #include <linux/cgroup.h> @@ -243,13 +243,7 @@ static LIST_HEAD(task_groups); /* task group related information */ struct task_group { -#ifdef CONFIG_CGROUP_SCHED struct cgroup_subsys_state css; -#endif - -#ifdef CONFIG_USER_SCHED - uid_t uid; -#endif #ifdef CONFIG_FAIR_GROUP_SCHED /* schedulable entities of this group on each cpu */ @@ -274,35 +268,7 @@ struct task_group { struct list_head children; }; -#ifdef CONFIG_USER_SCHED - -/* Helper function to pass uid information to create_sched_user() */ -void set_tg_uid(struct user_struct *user) -{ - user->tg->uid = user->uid; -} - -/* - * Root task group. - * Every UID task group (including init_task_group aka UID-0) will - * be a child to this group. - */ -struct task_group root_task_group; - -#ifdef CONFIG_FAIR_GROUP_SCHED -/* Default task group's sched entity on each cpu */ -static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); -/* Default task group's cfs_rq on each cpu */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq); -#endif /* CONFIG_FAIR_GROUP_SCHED */ - -#ifdef CONFIG_RT_GROUP_SCHED -static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); -static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var); -#endif /* CONFIG_RT_GROUP_SCHED */ -#else /* !CONFIG_USER_SCHED */ #define root_task_group init_task_group -#endif /* CONFIG_USER_SCHED */ /* task_group_lock serializes add/remove of task groups and also changes to * a task group's cpu shares. 
@@ -318,11 +284,7 @@ static int root_task_group_empty(void) } #endif -#ifdef CONFIG_USER_SCHED -# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) -#else /* !CONFIG_USER_SCHED */ # define INIT_TASK_GROUP_LOAD NICE_0_LOAD -#endif /* CONFIG_USER_SCHED */ /* * A weight of 0 or 1 can cause arithmetics problems. @@ -348,11 +310,7 @@ static inline struct task_group *task_group(struct task_struct *p) { struct task_group *tg; -#ifdef CONFIG_USER_SCHED - rcu_read_lock(); - tg = __task_cred(p)->user->tg; - rcu_read_unlock(); -#elif defined(CONFIG_CGROUP_SCHED) +#ifdef CONFIG_CGROUP_SCHED tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), struct task_group, css); #else @@ -383,7 +341,7 @@ static inline struct task_group *task_group(struct task_struct *p) return NULL; } -#endif /* CONFIG_GROUP_SCHED */ +#endif /* CONFIG_CGROUP_SCHED */ /* CFS-related fields in a runqueue */ struct cfs_rq { @@ -478,7 +436,6 @@ struct rt_rq { struct rq *rq; struct list_head leaf_rt_rq_list; struct task_group *tg; - struct sched_rt_entity *rt_se; #endif }; @@ -1414,32 +1371,6 @@ static const u32 prio_to_wmult[40] = { /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; -static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); - -/* - * runqueue iterator, to support SMP load-balancing between different - * scheduling classes, without having to expose their internal data - * structures to the load-balancing proper: - */ -struct rq_iterator { - void *arg; - struct task_struct *(*start)(void *); - struct task_struct *(*next)(void *); -}; - -#ifdef CONFIG_SMP -static unsigned long -balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, struct sched_domain *sd, - enum cpu_idle_type idle, int *all_pinned, - int *this_best_prio, struct rq_iterator *iterator); - -static int -iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle, - struct rq_iterator 
*iterator); -#endif - /* Time spent by the tasks of the cpu accounting group executing in ... */ enum cpuacct_stat_index { CPUACCT_STAT_USER, /* ... user mode */ @@ -1725,16 +1656,6 @@ static void update_shares(struct sched_domain *sd) } } -static void update_shares_locked(struct rq *rq, struct sched_domain *sd) -{ - if (root_task_group_empty()) - return; - - raw_spin_unlock(&rq->lock); - update_shares(sd); - raw_spin_lock(&rq->lock); -} - static void update_h_load(long cpu) { if (root_task_group_empty()) @@ -1749,10 +1670,6 @@ static inline void update_shares(struct sched_domain *sd) { } -static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) -{ -} - #endif #ifdef CONFIG_PREEMPT @@ -1829,6 +1746,51 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) raw_spin_unlock(&busiest->lock); lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); } + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + if (rq1 == rq2) { + raw_spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ + } else { + if (rq1 < rq2) { + raw_spin_lock(&rq1->lock); + raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); + } else { + raw_spin_lock(&rq2->lock); + raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); + } + } + update_rq_clock(rq1); + update_rq_clock(rq2); +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. 
+ */ +static void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + raw_spin_unlock(&rq1->lock); + if (rq1 != rq2) + raw_spin_unlock(&rq2->lock); + else + __release(rq2->lock); +} + #endif #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1858,18 +1820,14 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) #endif } -#include "sched_stats.h" -#include "sched_idletask.c" -#include "sched_fair.c" -#include "sched_rt.c" -#ifdef CONFIG_SCHED_DEBUG -# include "sched_debug.c" -#endif +static const struct sched_class rt_sched_class; #define sched_class_highest (&rt_sched_class) #define for_each_class(class) \ for (class = sched_class_highest; class; class = class->next) +#include "sched_stats.h" + static void inc_nr_running(struct rq *rq) { rq->nr_running++; @@ -1907,13 +1865,14 @@ static void update_avg(u64 *avg, u64 sample) *avg += diff >> 3; } -static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) +static void +enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head) { if (wakeup) p->se.start_runtime = p->se.sum_exec_runtime; sched_info_queued(p); - p->sched_class->enqueue_task(rq, p, wakeup); + p->sched_class->enqueue_task(rq, p, wakeup, head); p->se.on_rq = 1; } @@ -1936,6 +1895,37 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) } /* + * activate_task - move a task to the runqueue. + */ +static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) +{ + if (task_contributes_to_load(p)) + rq->nr_uninterruptible--; + + enqueue_task(rq, p, wakeup, false); + inc_nr_running(rq); +} + +/* + * deactivate_task - remove a task from the runqueue. 
+ */ +static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) +{ + if (task_contributes_to_load(p)) + rq->nr_uninterruptible++; + + dequeue_task(rq, p, sleep); + dec_nr_running(rq); +} + +#include "sched_idletask.c" +#include "sched_fair.c" +#include "sched_rt.c" +#ifdef CONFIG_SCHED_DEBUG +# include "sched_debug.c" +#endif + +/* * __normal_prio - return the priority that is based on the static prio */ static inline int __normal_prio(struct task_struct *p) @@ -1981,30 +1971,6 @@ static int effective_prio(struct task_struct *p) return p->prio; } -/* - * activate_task - move a task to the runqueue. - */ -static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) -{ - if (task_contributes_to_load(p)) - rq->nr_uninterruptible--; - - enqueue_task(rq, p, wakeup); - inc_nr_running(rq); -} - -/* - * deactivate_task - remove a task from the runqueue. - */ -static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) -{ - if (task_contributes_to_load(p)) - rq->nr_uninterruptible++; - - dequeue_task(rq, p, sleep); - dec_nr_running(rq); -} - /** * task_curr - is this task currently executing on a CPU? * @p: the task in question. @@ -3148,50 +3114,6 @@ static void update_cpu_load(struct rq *this_rq) #ifdef CONFIG_SMP /* - * double_rq_lock - safely lock two runqueues - * - * Note this does not disable interrupts like task_rq_lock, - * you need to do so manually before calling. 
- */ -static void double_rq_lock(struct rq *rq1, struct rq *rq2) - __acquires(rq1->lock) - __acquires(rq2->lock) -{ - BUG_ON(!irqs_disabled()); - if (rq1 == rq2) { - raw_spin_lock(&rq1->lock); - __acquire(rq2->lock); /* Fake it out ;) */ - } else { - if (rq1 < rq2) { - raw_spin_lock(&rq1->lock); - raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); - } else { - raw_spin_lock(&rq2->lock); - raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); - } - } - update_rq_clock(rq1); - update_rq_clock(rq2); -} - -/* - * double_rq_unlock - safely unlock two runqueues - * - * Note this does not restore interrupts like task_rq_unlock, - * you need to do so manually after calling. - */ -static void double_rq_unlock(struct rq *rq1, struct rq *rq2) - __releases(rq1->lock) - __releases(rq2->lock) -{ - raw_spin_unlock(&rq1->lock); - if (rq1 != rq2) - raw_spin_unlock(&rq2->lock); - else - __release(rq2->lock); -} - -/* * sched_exec - execve() is a valuable balancing opportunity, because at * this point the task has the smallest effective memory and cache footprint. */ @@ -3239,1782 +3161,6 @@ again: task_rq_unlock(rq, &flags); } -/* - * pull_task - move a task from a remote runqueue to the local runqueue. - * Both runqueues must be locked. - */ -static void pull_task(struct rq *src_rq, struct task_struct *p, - struct rq *this_rq, int this_cpu) -{ - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); - check_preempt_curr(this_rq, p, 0); -} - -/* - * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? - */ -static -int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned) -{ - int tsk_cache_hot = 0; - /* - * We do not migrate tasks that are: - * 1) running (obviously), or - * 2) cannot be migrated to this CPU due to cpus_allowed, or - * 3) are cache-hot on their current CPU. 
- */ - if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { - schedstat_inc(p, se.nr_failed_migrations_affine); - return 0; - } - *all_pinned = 0; - - if (task_running(rq, p)) { - schedstat_inc(p, se.nr_failed_migrations_running); - return 0; - } - - /* - * Aggressive migration if: - * 1) task is cache cold, or - * 2) too many balance attempts have failed. - */ - - tsk_cache_hot = task_hot(p, rq->clock, sd); - if (!tsk_cache_hot || - sd->nr_balance_failed > sd->cache_nice_tries) { -#ifdef CONFIG_SCHEDSTATS - if (tsk_cache_hot) { - schedstat_inc(sd, lb_hot_gained[idle]); - schedstat_inc(p, se.nr_forced_migrations); - } -#endif - return 1; - } - - if (tsk_cache_hot) { - schedstat_inc(p, se.nr_failed_migrations_hot); - return 0; - } - return 1; -} - -static unsigned long -balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, struct sched_domain *sd, - enum cpu_idle_type idle, int *all_pinned, - int *this_best_prio, struct rq_iterator *iterator) -{ - int loops = 0, pulled = 0, pinned = 0; - struct task_struct *p; - long rem_load_move = max_load_move; - - if (max_load_move == 0) - goto out; - - pinned = 1; - - /* - * Start the load-balancing iterator: - */ - p = iterator->start(iterator->arg); -next: - if (!p || loops++ > sysctl_sched_nr_migrate) - goto out; - - if ((p->se.load.weight >> 1) > rem_load_move || - !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { - p = iterator->next(iterator->arg); - goto next; - } - - pull_task(busiest, p, this_rq, this_cpu); - pulled++; - rem_load_move -= p->se.load.weight; - -#ifdef CONFIG_PREEMPT - /* - * NEWIDLE balancing is a source of latency, so preemptible kernels - * will stop after the first task is pulled to minimize the critical - * section. - */ - if (idle == CPU_NEWLY_IDLE) - goto out; -#endif - - /* - * We only want to steal up to the prescribed amount of weighted load. 
- */ - if (rem_load_move > 0) { - if (p->prio < *this_best_prio) - *this_best_prio = p->prio; - p = iterator->next(iterator->arg); - goto next; - } -out: - /* - * Right now, this is one of only two places pull_task() is called, - * so we can safely collect pull_task() stats here rather than - * inside pull_task(). - */ - schedstat_add(sd, lb_gained[idle], pulled); - - if (all_pinned) - *all_pinned = pinned; - - return max_load_move - rem_load_move; -} - -/* - * move_tasks tries to move up to max_load_move weighted load from busiest to - * this_rq, as part of a balancing operation within domain "sd". - * Returns 1 if successful and 0 otherwise. - * - * Called with both runqueues locked. - */ -static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned) -{ - const struct sched_class *class = sched_class_highest; - unsigned long total_load_moved = 0; - int this_best_prio = this_rq->curr->prio; - - do { - total_load_moved += - class->load_balance(this_rq, this_cpu, busiest, - max_load_move - total_load_moved, - sd, idle, all_pinned, &this_best_prio); - class = class->next; - -#ifdef CONFIG_PREEMPT - /* - * NEWIDLE balancing is a source of latency, so preemptible - * kernels will stop after the first task is pulled to minimize - * the critical section. 
- */ - if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) - break; -#endif - } while (class && max_load_move > total_load_moved); - - return total_load_moved > 0; -} - -static int -iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle, - struct rq_iterator *iterator) -{ - struct task_struct *p = iterator->start(iterator->arg); - int pinned = 0; - - while (p) { - if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { - pull_task(busiest, p, this_rq, this_cpu); - /* - * Right now, this is only the second place pull_task() - * is called, so we can safely collect pull_task() - * stats here rather than inside pull_task(). - */ - schedstat_inc(sd, lb_gained[idle]); - - return 1; - } - p = iterator->next(iterator->arg); - } - - return 0; -} - -/* - * move_one_task tries to move exactly one task from busiest to this_rq, as - * part of active balancing operations within "domain". - * Returns 1 if successful and 0 otherwise. - * - * Called with both runqueues locked. - */ -static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle) -{ - const struct sched_class *class; - - for_each_class(class) { - if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) - return 1; - } - - return 0; -} -/********** Helpers for find_busiest_group ************************/ -/* - * sd_lb_stats - Structure to store the statistics of a sched_domain - * during load balancing. 
- */ -struct sd_lb_stats { - struct sched_group *busiest; /* Busiest group in this sd */ - struct sched_group *this; /* Local group in this sd */ - unsigned long total_load; /* Total load of all groups in sd */ - unsigned long total_pwr; /* Total power of all groups in sd */ - unsigned long avg_load; /* Average load across all groups in sd */ - - /** Statistics of this group */ - unsigned long this_load; - unsigned long this_load_per_task; - unsigned long this_nr_running; - - /* Statistics of the busiest group */ - unsigned long max_load; - unsigned long busiest_load_per_task; - unsigned long busiest_nr_running; - - int group_imb; /* Is there imbalance in this sd */ -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - int power_savings_balance; /* Is powersave balance needed for this sd */ - struct sched_group *group_min; /* Least loaded group in sd */ - struct sched_group *group_leader; /* Group which relieves group_min */ - unsigned long min_load_per_task; /* load_per_task in group_min */ - unsigned long leader_nr_running; /* Nr running of group_leader */ - unsigned long min_nr_running; /* Nr running of group_min */ -#endif -}; - -/* - * sg_lb_stats - stats of a sched_group required for load_balancing - */ -struct sg_lb_stats { - unsigned long avg_load; /*Avg load across the CPUs of the group */ - unsigned long group_load; /* Total load over the CPUs of the group */ - unsigned long sum_nr_running; /* Nr tasks running in the group */ - unsigned long sum_weighted_load; /* Weighted load of group's tasks */ - unsigned long group_capacity; - int group_imb; /* Is there an imbalance in the group ? */ -}; - -/** - * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. - * @group: The group whose first cpu is to be returned. - */ -static inline unsigned int group_first_cpu(struct sched_group *group) -{ - return cpumask_first(sched_group_cpus(group)); -} - -/** - * get_sd_load_idx - Obtain the load index for a given sched domain. 
- * @sd: The sched_domain whose load_idx is to be obtained. - * @idle: The Idle status of the CPU for whose sd load_icx is obtained. - */ -static inline int get_sd_load_idx(struct sched_domain *sd, - enum cpu_idle_type idle) -{ - int load_idx; - - switch (idle) { - case CPU_NOT_IDLE: - load_idx = sd->busy_idx; - break; - - case CPU_NEWLY_IDLE: - load_idx = sd->newidle_idx; - break; - default: - load_idx = sd->idle_idx; - break; - } - - return load_idx; -} - - -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -/** - * init_sd_power_savings_stats - Initialize power savings statistics for - * the given sched_domain, during load balancing. - * - * @sd: Sched domain whose power-savings statistics are to be initialized. - * @sds: Variable containing the statistics for sd. - * @idle: Idle status of the CPU at which we're performing load-balancing. - */ -static inline void init_sd_power_savings_stats(struct sched_domain *sd, - struct sd_lb_stats *sds, enum cpu_idle_type idle) -{ - /* - * Busy processors will not participate in power savings - * balance. - */ - if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) - sds->power_savings_balance = 0; - else { - sds->power_savings_balance = 1; - sds->min_nr_running = ULONG_MAX; - sds->leader_nr_running = 0; - } -} - -/** - * update_sd_power_savings_stats - Update the power saving stats for a - * sched_domain while performing load balancing. - * - * @group: sched_group belonging to the sched_domain under consideration. - * @sds: Variable containing the statistics of the sched_domain - * @local_group: Does group contain the CPU for which we're performing - * load balancing ? - * @sgs: Variable containing the statistics of the group. 
- */ -static inline void update_sd_power_savings_stats(struct sched_group *group, - struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) -{ - - if (!sds->power_savings_balance) - return; - - /* - * If the local group is idle or completely loaded - * no need to do power savings balance at this domain - */ - if (local_group && (sds->this_nr_running >= sgs->group_capacity || - !sds->this_nr_running)) - sds->power_savings_balance = 0; - - /* - * If a group is already running at full capacity or idle, - * don't include that group in power savings calculations - */ - if (!sds->power_savings_balance || - sgs->sum_nr_running >= sgs->group_capacity || - !sgs->sum_nr_running) - return; - - /* - * Calculate the group which has the least non-idle load. - * This is the group from where we need to pick up the load - * for saving power - */ - if ((sgs->sum_nr_running < sds->min_nr_running) || - (sgs->sum_nr_running == sds->min_nr_running && - group_first_cpu(group) > group_first_cpu(sds->group_min))) { - sds->group_min = group; - sds->min_nr_running = sgs->sum_nr_running; - sds->min_load_per_task = sgs->sum_weighted_load / - sgs->sum_nr_running; - } - - /* - * Calculate the group which is almost near its - * capacity but still has some space to pick up some load - * from other group and save more power - */ - if (sgs->sum_nr_running + 1 > sgs->group_capacity) - return; - - if (sgs->sum_nr_running > sds->leader_nr_running || - (sgs->sum_nr_running == sds->leader_nr_running && - group_first_cpu(group) < group_first_cpu(sds->group_leader))) { - sds->group_leader = group; - sds->leader_nr_running = sgs->sum_nr_running; - } -} - -/** - * check_power_save_busiest_group - see if there is potential for some power-savings balance - * @sds: Variable containing the statistics of the sched_domain - * under consideration. - * @this_cpu: Cpu at which we're currently performing load-balancing. - * @imbalance: Variable to store the imbalance. 
- * - * Description: - * Check if we have potential to perform some power-savings balance. - * If yes, set the busiest group to be the least loaded group in the - * sched_domain, so that it's CPUs can be put to idle. - * - * Returns 1 if there is potential to perform power-savings balance. - * Else returns 0. - */ -static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, - int this_cpu, unsigned long *imbalance)< |