aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2010-02-28 10:31:01 -0800
committer Linus Torvalds <torvalds@linux-foundation.org> 2010-02-28 10:31:01 -0800
commit f66ffdedbf0fc059a92219bb08c1dbcac88f074b (patch)
tree 9db4ad51764455123130e82fb7acf4f0a0be58ce
parent 2531216f236cb2a1f39ffa12a4a9339541e52191 (diff)
parent dd5feea14a7de4edbd9f36db1a2db785de91b88d (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (25 commits)
  sched: Fix SCHED_MC regression caused by change in sched cpu_power
  sched: Don't use possibly stale sched_class
  kthread, sched: Remove reference to kthread_create_on_cpu
  sched: cpuacct: Use bigger percpu counter batch values for stats counters
  percpu_counter: Make __percpu_counter_add an inline function on UP
  sched: Remove member rt_se from struct rt_rq
  sched: Change usage of rt_rq->rt_se to rt_rq->tg->rt_se[cpu]
  sched: Remove unused update_shares_locked()
  sched: Use for_each_bit
  sched: Queue a deboosted task to the head of the RT prio queue
  sched: Implement head queueing for sched_rt
  sched: Extend enqueue_task to allow head queueing
  sched: Remove USER_SCHED
  sched: Fix the place where group powers are updated
  sched: Assume *balance is valid
  sched: Remove load_balance_newidle()
  sched: Unify load_balance{,_newidle}()
  sched: Add a lock break for PREEMPT=y
  sched: Remove from fwd decls
  sched: Remove rq_iterator from move_one_task
  ...

Fix up trivial conflicts in kernel/sched.c
-rw-r--r-- Documentation/feature-removal-schedule.txt 15
-rw-r--r-- include/linux/kernel.h 5
-rw-r--r-- include/linux/percpu_counter.h 9
-rw-r--r-- include/linux/sched.h 25
-rw-r--r-- init/Kconfig 81
-rw-r--r-- kernel/ksysfs.c 8
-rw-r--r-- kernel/kthread.c 2
-rw-r--r-- kernel/sched.c 2125
-rw-r--r-- kernel/sched_cpupri.c 4
-rw-r--r-- kernel/sched_fair.c 1699
-rw-r--r-- kernel/sched_idletask.c 23
-rw-r--r-- kernel/sched_rt.c 54
-rw-r--r-- kernel/sys.c 5
-rw-r--r-- kernel/user.c 305
14 files changed, 1827 insertions, 2533 deletions
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index b9eba900e0f..ea401495528 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -6,21 +6,6 @@ be removed from this file.
---------------------------
-What: USER_SCHED
-When: 2.6.34
-
-Why: USER_SCHED was implemented as a proof of concept for group scheduling.
- The effect of USER_SCHED can already be achieved from userspace with
- the help of libcgroup. The removal of USER_SCHED will also simplify
- the scheduler code with the removal of one major ifdef. There are also
- issues USER_SCHED has with USER_NS. A decision was taken not to fix
- those and instead remove USER_SCHED. Also new group scheduling
- features will not be implemented for USER_SCHED.
-
-Who: Dhaval Giani <dhaval@linux.vnet.ibm.com>
-
----------------------------
-
What: PRISM54
When: 2.6.34
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 328bca609b9..1221d2331a6 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -124,7 +124,7 @@ extern int _cond_resched(void);
#endif
#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
- void __might_sleep(char *file, int line, int preempt_offset);
+ void __might_sleep(const char *file, int line, int preempt_offset);
/**
* might_sleep - annotation for functions that can sleep
*
@@ -138,7 +138,8 @@ extern int _cond_resched(void);
# define might_sleep() \
do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
#else
- static inline void __might_sleep(char *file, int line, int preempt_offset) { }
+ static inline void __might_sleep(const char *file, int line,
+ int preempt_offset) { }
# define might_sleep() do { might_resched(); } while (0)
#endif
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index a7684a51399..794662b2be5 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -98,9 +98,6 @@ static inline void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
fbc->count = amount;
}
-#define __percpu_counter_add(fbc, amount, batch) \
- percpu_counter_add(fbc, amount)
-
static inline void
percpu_counter_add(struct percpu_counter *fbc, s64 amount)
{
@@ -109,6 +106,12 @@ percpu_counter_add(struct percpu_counter *fbc, s64 amount)
preempt_enable();
}
+static inline void
+__percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch)
+{
+ percpu_counter_add(fbc, amount);
+}
+
static inline s64 percpu_counter_read(struct percpu_counter *fbc)
{
return fbc->count;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1f5fa53b46b..0eef87b58ea 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -740,14 +740,6 @@ struct user_struct {
uid_t uid;
struct user_namespace *user_ns;
-#ifdef CONFIG_USER_SCHED
- struct task_group *tg;
-#ifdef CONFIG_SYSFS
- struct kobject kobj;
- struct delayed_work work;
-#endif
-#endif
-
#ifdef CONFIG_PERF_EVENTS
atomic_long_t locked_vm;
#endif
@@ -1087,7 +1079,8 @@ struct sched_domain;
struct sched_class {
const struct sched_class *next;
- void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup);
+ void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup,
+ bool head);
void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
void (*yield_task) (struct rq *rq);
@@ -1099,14 +1092,6 @@ struct sched_class {
#ifdef CONFIG_SMP
int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
- unsigned long (*load_balance) (struct rq *this_rq, int this_cpu,
- struct rq *busiest, unsigned long max_load_move,
- struct sched_domain *sd, enum cpu_idle_type idle,
- int *all_pinned, int *this_best_prio);
-
- int (*move_one_task) (struct rq *this_rq, int this_cpu,
- struct rq *busiest, struct sched_domain *sd,
- enum cpu_idle_type idle);
void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
void (*post_schedule) (struct rq *this_rq);
void (*task_waking) (struct rq *this_rq, struct task_struct *task);
@@ -2520,13 +2505,9 @@ extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
extern void normalize_rt_tasks(void);
-#ifdef CONFIG_GROUP_SCHED
+#ifdef CONFIG_CGROUP_SCHED
extern struct task_group init_task_group;
-#ifdef CONFIG_USER_SCHED
-extern struct task_group root_task_group;
-extern void set_tg_uid(struct user_struct *user);
-#endif
extern struct task_group *sched_create_group(struct task_group *parent);
extern void sched_destroy_group(struct task_group *tg);
diff --git a/init/Kconfig b/init/Kconfig
index c6d95f8ea05..089a230e565 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -461,57 +461,6 @@ config LOG_BUF_SHIFT
config HAVE_UNSTABLE_SCHED_CLOCK
bool
-config GROUP_SCHED
- bool "Group CPU scheduler"
- depends on EXPERIMENTAL
- default n
- help
- This feature lets CPU scheduler recognize task groups and control CPU
- bandwidth allocation to such task groups.
- In order to create a group from arbitrary set of processes, use
- CONFIG_CGROUPS. (See Control Group support.)
-
-config FAIR_GROUP_SCHED
- bool "Group scheduling for SCHED_OTHER"
- depends on GROUP_SCHED
- default GROUP_SCHED
-
-config RT_GROUP_SCHED
- bool "Group scheduling for SCHED_RR/FIFO"
- depends on EXPERIMENTAL
- depends on GROUP_SCHED
- default n
- help
- This feature lets you explicitly allocate real CPU bandwidth
- to users or control groups (depending on the "Basis for grouping tasks"
- setting below. If enabled, it will also make it impossible to
- schedule realtime tasks for non-root users until you allocate
- realtime bandwidth for them.
- See Documentation/scheduler/sched-rt-group.txt for more information.
-
-choice
- depends on GROUP_SCHED
- prompt "Basis for grouping tasks"
- default USER_SCHED
-
-config USER_SCHED
- bool "user id"
- help
- This option will choose userid as the basis for grouping
- tasks, thus providing equal CPU bandwidth to each user.
-
-config CGROUP_SCHED
- bool "Control groups"
- depends on CGROUPS
- help
- This option allows you to create arbitrary task groups
- using the "cgroup" pseudo filesystem and control
- the cpu bandwidth allocated to each such task group.
- Refer to Documentation/cgroups/cgroups.txt for more
- information on "cgroup" pseudo filesystem.
-
-endchoice
-
menuconfig CGROUPS
boolean "Control Group support"
help
@@ -632,6 +581,36 @@ config CGROUP_MEM_RES_CTLR_SWAP
Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page
size is 4096bytes, 512k per 1Gbytes of swap.
+menuconfig CGROUP_SCHED
+ bool "Group CPU scheduler"
+ depends on EXPERIMENTAL && CGROUPS
+ default n
+ help
+ This feature lets CPU scheduler recognize task groups and control CPU
+ bandwidth allocation to such task groups. It uses cgroups to group
+ tasks.
+
+if CGROUP_SCHED
+config FAIR_GROUP_SCHED
+ bool "Group scheduling for SCHED_OTHER"
+ depends on CGROUP_SCHED
+ default CGROUP_SCHED
+
+config RT_GROUP_SCHED
+ bool "Group scheduling for SCHED_RR/FIFO"
+ depends on EXPERIMENTAL
+ depends on CGROUP_SCHED
+ default n
+ help
+ This feature lets you explicitly allocate real CPU bandwidth
+ to users or control groups (depending on the "Basis for grouping tasks"
+ setting below. If enabled, it will also make it impossible to
+ schedule realtime tasks for non-root users until you allocate
+ realtime bandwidth for them.
+ See Documentation/scheduler/sched-rt-group.txt for more information.
+
+endif #CGROUP_SCHED
+
endif # CGROUPS
config MM_OWNER
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 3feaf5a7451..6b1ccc3f020 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -197,16 +197,8 @@ static int __init ksysfs_init(void)
goto group_exit;
}
- /* create the /sys/kernel/uids/ directory */
- error = uids_sysfs_init();
- if (error)
- goto notes_exit;
-
return 0;
-notes_exit:
- if (notes_size > 0)
- sysfs_remove_bin_file(kernel_kobj, &notes_attr);
group_exit:
sysfs_remove_group(kernel_kobj, &kernel_attr_group);
kset_exit:
diff --git a/kernel/kthread.c b/kernel/kthread.c
index fbb6222fe7e..82ed0ea1519 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -101,7 +101,7 @@ static void create_kthread(struct kthread_create_info *create)
*
* Description: This helper function creates and names a kernel
* thread. The thread will be stopped: use wake_up_process() to start
- * it. See also kthread_run(), kthread_create_on_cpu().
+ * it. See also kthread_run().
*
* When woken, the thread will run @threadfn() with @data as its
* argument. @threadfn() can either call do_exit() directly if it is a
diff --git a/kernel/sched.c b/kernel/sched.c
index caf54e1eef6..6a212c97f52 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -233,7 +233,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
*/
static DEFINE_MUTEX(sched_domains_mutex);
-#ifdef CONFIG_GROUP_SCHED
+#ifdef CONFIG_CGROUP_SCHED
#include <linux/cgroup.h>
@@ -243,13 +243,7 @@ static LIST_HEAD(task_groups);
/* task group related information */
struct task_group {
-#ifdef CONFIG_CGROUP_SCHED
struct cgroup_subsys_state css;
-#endif
-
-#ifdef CONFIG_USER_SCHED
- uid_t uid;
-#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
/* schedulable entities of this group on each cpu */
@@ -274,35 +268,7 @@ struct task_group {
struct list_head children;
};
-#ifdef CONFIG_USER_SCHED
-
-/* Helper function to pass uid information to create_sched_user() */
-void set_tg_uid(struct user_struct *user)
-{
- user->tg->uid = user->uid;
-}
-
-/*
- * Root task group.
- * Every UID task group (including init_task_group aka UID-0) will
- * be a child to this group.
- */
-struct task_group root_task_group;
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-/* Default task group's sched entity on each cpu */
-static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
-/* Default task group's cfs_rq on each cpu */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-
-#ifdef CONFIG_RT_GROUP_SCHED
-static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var);
-#endif /* CONFIG_RT_GROUP_SCHED */
-#else /* !CONFIG_USER_SCHED */
#define root_task_group init_task_group
-#endif /* CONFIG_USER_SCHED */
/* task_group_lock serializes add/remove of task groups and also changes to
* a task group's cpu shares.
@@ -318,11 +284,7 @@ static int root_task_group_empty(void)
}
#endif
-#ifdef CONFIG_USER_SCHED
-# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
-#else /* !CONFIG_USER_SCHED */
# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
-#endif /* CONFIG_USER_SCHED */
/*
* A weight of 0 or 1 can cause arithmetics problems.
@@ -348,11 +310,7 @@ static inline struct task_group *task_group(struct task_struct *p)
{
struct task_group *tg;
-#ifdef CONFIG_USER_SCHED
- rcu_read_lock();
- tg = __task_cred(p)->user->tg;
- rcu_read_unlock();
-#elif defined(CONFIG_CGROUP_SCHED)
+#ifdef CONFIG_CGROUP_SCHED
tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
struct task_group, css);
#else
@@ -383,7 +341,7 @@ static inline struct task_group *task_group(struct task_struct *p)
return NULL;
}
-#endif /* CONFIG_GROUP_SCHED */
+#endif /* CONFIG_CGROUP_SCHED */
/* CFS-related fields in a runqueue */
struct cfs_rq {
@@ -478,7 +436,6 @@ struct rt_rq {
struct rq *rq;
struct list_head leaf_rt_rq_list;
struct task_group *tg;
- struct sched_rt_entity *rt_se;
#endif
};
@@ -1414,32 +1371,6 @@ static const u32 prio_to_wmult[40] = {
/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
-static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
-
-/*
- * runqueue iterator, to support SMP load-balancing between different
- * scheduling classes, without having to expose their internal data
- * structures to the load-balancing proper:
- */
-struct rq_iterator {
- void *arg;
- struct task_struct *(*start)(void *);
- struct task_struct *(*next)(void *);
-};
-
-#ifdef CONFIG_SMP
-static unsigned long
-balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
- unsigned long max_load_move, struct sched_domain *sd,
- enum cpu_idle_type idle, int *all_pinned,
- int *this_best_prio, struct rq_iterator *iterator);
-
-static int
-iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
- struct sched_domain *sd, enum cpu_idle_type idle,
- struct rq_iterator *iterator);
-#endif
-
/* Time spent by the tasks of the cpu accounting group executing in ... */
enum cpuacct_stat_index {
CPUACCT_STAT_USER, /* ... user mode */
@@ -1725,16 +1656,6 @@ static void update_shares(struct sched_domain *sd)
}
}
-static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
-{
- if (root_task_group_empty())
- return;
-
- raw_spin_unlock(&rq->lock);
- update_shares(sd);
- raw_spin_lock(&rq->lock);
-}
-
static void update_h_load(long cpu)
{
if (root_task_group_empty())
@@ -1749,10 +1670,6 @@ static inline void update_shares(struct sched_domain *sd)
{
}
-static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
-{
-}
-
#endif
#ifdef CONFIG_PREEMPT
@@ -1829,6 +1746,51 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
raw_spin_unlock(&busiest->lock);
lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
}
+
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+static void double_rq_lock(struct rq *rq1, struct rq *rq2)
+ __acquires(rq1->lock)
+ __acquires(rq2->lock)
+{
+ BUG_ON(!irqs_disabled());
+ if (rq1 == rq2) {
+ raw_spin_lock(&rq1->lock);
+ __acquire(rq2->lock); /* Fake it out ;) */
+ } else {
+ if (rq1 < rq2) {
+ raw_spin_lock(&rq1->lock);
+ raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
+ } else {
+ raw_spin_lock(&rq2->lock);
+ raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
+ }
+ }
+ update_rq_clock(rq1);
+ update_rq_clock(rq2);
+}
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+ __releases(rq1->lock)
+ __releases(rq2->lock)
+{
+ raw_spin_unlock(&rq1->lock);
+ if (rq1 != rq2)
+ raw_spin_unlock(&rq2->lock);
+ else
+ __release(rq2->lock);
+}
+
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1858,18 +1820,14 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
#endif
}
-#include "sched_stats.h"
-#include "sched_idletask.c"
-#include "sched_fair.c"
-#include "sched_rt.c"
-#ifdef CONFIG_SCHED_DEBUG
-# include "sched_debug.c"
-#endif
+static const struct sched_class rt_sched_class;
#define sched_class_highest (&rt_sched_class)
#define for_each_class(class) \
for (class = sched_class_highest; class; class = class->next)
+#include "sched_stats.h"
+
static void inc_nr_running(struct rq *rq)
{
rq->nr_running++;
@@ -1907,13 +1865,14 @@ static void update_avg(u64 *avg, u64 sample)
*avg += diff >> 3;
}
-static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
+static void
+enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
{
if (wakeup)
p->se.start_runtime = p->se.sum_exec_runtime;
sched_info_queued(p);
- p->sched_class->enqueue_task(rq, p, wakeup);
+ p->sched_class->enqueue_task(rq, p, wakeup, head);
p->se.on_rq = 1;
}
@@ -1936,6 +1895,37 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
}
/*
+ * activate_task - move a task to the runqueue.
+ */
+static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
+{
+ if (task_contributes_to_load(p))
+ rq->nr_uninterruptible--;
+
+ enqueue_task(rq, p, wakeup, false);
+ inc_nr_running(rq);
+}
+
+/*
+ * deactivate_task - remove a task from the runqueue.
+ */
+static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
+{
+ if (task_contributes_to_load(p))
+ rq->nr_uninterruptible++;
+
+ dequeue_task(rq, p, sleep);
+ dec_nr_running(rq);
+}
+
+#include "sched_idletask.c"
+#include "sched_fair.c"
+#include "sched_rt.c"
+#ifdef CONFIG_SCHED_DEBUG
+# include "sched_debug.c"
+#endif
+
+/*
* __normal_prio - return the priority that is based on the static prio
*/
static inline int __normal_prio(struct task_struct *p)
@@ -1981,30 +1971,6 @@ static int effective_prio(struct task_struct *p)
return p->prio;
}
-/*
- * activate_task - move a task to the runqueue.
- */
-static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
-{
- if (task_contributes_to_load(p))
- rq->nr_uninterruptible--;
-
- enqueue_task(rq, p, wakeup);
- inc_nr_running(rq);
-}
-
-/*
- * deactivate_task - remove a task from the runqueue.
- */
-static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
-{
- if (task_contributes_to_load(p))
- rq->nr_uninterruptible++;
-
- dequeue_task(rq, p, sleep);
- dec_nr_running(rq);
-}
-
/**
* task_curr - is this task currently executing on a CPU?
* @p: the task in question.
@@ -3148,50 +3114,6 @@ static void update_cpu_load(struct rq *this_rq)
#ifdef CONFIG_SMP
/*
- * double_rq_lock - safely lock two runqueues
- *
- * Note this does not disable interrupts like task_rq_lock,
- * you need to do so manually before calling.
- */
-static void double_rq_lock(struct rq *rq1, struct rq *rq2)
- __acquires(rq1->lock)
- __acquires(rq2->lock)
-{
- BUG_ON(!irqs_disabled());
- if (rq1 == rq2) {
- raw_spin_lock(&rq1->lock);
- __acquire(rq2->lock); /* Fake it out ;) */
- } else {
- if (rq1 < rq2) {
- raw_spin_lock(&rq1->lock);
- raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
- } else {
- raw_spin_lock(&rq2->lock);
- raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
- }
- }
- update_rq_clock(rq1);
- update_rq_clock(rq2);
-}
-
-/*
- * double_rq_unlock - safely unlock two runqueues
- *
- * Note this does not restore interrupts like task_rq_unlock,
- * you need to do so manually after calling.
- */
-static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
- __releases(rq1->lock)
- __releases(rq2->lock)
-{
- raw_spin_unlock(&rq1->lock);
- if (rq1 != rq2)
- raw_spin_unlock(&rq2->lock);
- else
- __release(rq2->lock);
-}
-
-/*
* sched_exec - execve() is a valuable balancing opportunity, because at
* this point the task has the smallest effective memory and cache footprint.
*/
@@ -3239,1782 +3161,6 @@ again:
task_rq_unlock(rq, &flags);
}
-/*
- * pull_task - move a task from a remote runqueue to the local runqueue.
- * Both runqueues must be locked.
- */
-static void pull_task(struct rq *src_rq, struct task_struct *p,
- struct rq *this_rq, int this_cpu)
-{
- deactivate_task(src_rq, p, 0);
- set_task_cpu(p, this_cpu);
- activate_task(this_rq, p, 0);
- check_preempt_curr(this_rq, p, 0);
-}
-
-/*
- * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
- */
-static
-int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
- struct sched_domain *sd, enum cpu_idle_type idle,
- int *all_pinned)
-{
- int tsk_cache_hot = 0;
- /*
- * We do not migrate tasks that are:
- * 1) running (obviously), or
- * 2) cannot be migrated to this CPU due to cpus_allowed, or
- * 3) are cache-hot on their current CPU.
- */
- if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
- schedstat_inc(p, se.nr_failed_migrations_affine);
- return 0;
- }
- *all_pinned = 0;
-
- if (task_running(rq, p)) {
- schedstat_inc(p, se.nr_failed_migrations_running);
- return 0;
- }
-
- /*
- * Aggressive migration if:
- * 1) task is cache cold, or
- * 2) too many balance attempts have failed.
- */
-
- tsk_cache_hot = task_hot(p, rq->clock, sd);
- if (!tsk_cache_hot ||
- sd->nr_balance_failed > sd->cache_nice_tries) {
-#ifdef CONFIG_SCHEDSTATS
- if (tsk_cache_hot) {
- schedstat_inc(sd, lb_hot_gained[idle]);
- schedstat_inc(p, se.nr_forced_migrations);
- }
-#endif
- return 1;
- }
-
- if (tsk_cache_hot) {
- schedstat_inc(p, se.nr_failed_migrations_hot);
- return 0;
- }
- return 1;
-}
-
-static unsigned long
-balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
- unsigned long max_load_move, struct sched_domain *sd,
- enum cpu_idle_type idle, int *all_pinned,
- int *this_best_prio, struct rq_iterator *iterator)
-{
- int loops = 0, pulled = 0, pinned = 0;
- struct task_struct *p;
- long rem_load_move = max_load_move;
-
- if (max_load_move == 0)
- goto out;
-
- pinned = 1;
-
- /*
- * Start the load-balancing iterator:
- */
- p = iterator->start(iterator->arg);
-next:
- if (!p || loops++ > sysctl_sched_nr_migrate)
- goto out;
-
- if ((p->se.load.weight >> 1) > rem_load_move ||
- !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
- p = iterator->next(iterator->arg);
- goto next;
- }
-
- pull_task(busiest, p, this_rq, this_cpu);
- pulled++;
- rem_load_move -= p->se.load.weight;
-
-#ifdef CONFIG_PREEMPT
- /*
- * NEWIDLE balancing is a source of latency, so preemptible kernels
- * will stop after the first task is pulled to minimize the critical
- * section.
- */
- if (idle == CPU_NEWLY_IDLE)
- goto out;
-#endif
-
- /*
- * We only want to steal up to the prescribed amount of weighted load.
- */
- if (rem_load_move > 0) {
- if (p->prio < *this_best_prio)
- *this_best_prio = p->prio;
- p = iterator->next(iterator->arg);
- goto next;
- }
-out:
- /*
- * Right now, this is one of only two places pull_task() is called,
- * so we can safely collect pull_task() stats here rather than
- * inside pull_task().
- */
- schedstat_add(sd, lb_gained[idle], pulled);
-
- if (all_pinned)
- *all_pinned = pinned;
-
- return max_load_move - rem_load_move;
-}
-
-/*
- * move_tasks tries to move up to max_load_move weighted load from busiest to
- * this_rq, as part of a balancing operation within domain "sd".
- * Returns 1 if successful and 0 otherwise.
- *
- * Called with both runqueues locked.
- */
-static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
- unsigned long max_load_move,
- struct sched_domain *sd, enum cpu_idle_type idle,
- int *all_pinned)
-{
- const struct sched_class *class = sched_class_highest;
- unsigned long total_load_moved = 0;
- int this_best_prio = this_rq->curr->prio;
-
- do {
- total_load_moved +=
- class->load_balance(this_rq, this_cpu, busiest,
- max_load_move - total_load_moved,
- sd, idle, all_pinned, &this_best_prio);
- class = class->next;
-
-#ifdef CONFIG_PREEMPT
- /*
- * NEWIDLE balancing is a source of latency, so preemptible
- * kernels will stop after the first task is pulled to minimize
- * the critical section.
- */
- if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
- break;
-#endif
- } while (class && max_load_move > total_load_moved);
-
- return total_load_moved > 0;
-}
-
-static int
-iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
- struct sched_domain *sd, enum cpu_idle_type idle,
- struct rq_iterator *iterator)
-{
- struct task_struct *p = iterator->start(iterator->arg);
- int pinned = 0;
-
- while (p) {
- if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
- pull_task(busiest, p, this_rq, this_cpu);
- /*
- * Right now, this is only the second place pull_task()
- * is called, so we can safely collect pull_task()
- * stats here rather than inside pull_task().
- */
- schedstat_inc(sd, lb_gained[idle]);
-
- return 1;
- }
- p = iterator->next(iterator->arg);
- }
-
- return 0;
-}
-
-/*
- * move_one_task tries to move exactly one task from busiest to this_rq, as
- * part of active balancing operations within "domain".
- * Returns 1 if successful and 0 otherwise.
- *
- * Called with both runqueues locked.
- */
-static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
- struct sched_domain *sd, enum cpu_idle_type idle)
-{
- const struct sched_class *class;
-
- for_each_class(class) {
- if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
- return 1;
- }
-
- return 0;
-}
-/********** Helpers for find_busiest_group ************************/
-/*
- * sd_lb_stats - Structure to store the statistics of a sched_domain
- * during load balancing.
- */
-struct sd_lb_stats {
- struct sched_group *busiest; /* Busiest group in this sd */
- struct sched_group *this; /* Local group in this sd */
- unsigned long total_load; /* Total load of all groups in sd */
- unsigned long total_pwr; /* Total power of all groups in sd */
- unsigned long avg_load; /* Average load across all groups in sd */
-
- /** Statistics of this group */
- unsigned long this_load;
- unsigned long this_load_per_task;
- unsigned long this_nr_running;
-
- /* Statistics of the busiest group */
- unsigned long max_load;
- unsigned long busiest_load_per_task;
- unsigned long busiest_nr_running;
-
- int group_imb; /* Is there imbalance in this sd */
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
- int power_savings_balance; /* Is powersave balance needed for this sd */
- struct sched_group *group_min; /* Least loaded group in sd */
- struct sched_group *group_leader; /* Group which relieves group_min */
- unsigned long min_load_per_task; /* load_per_task in group_min */
- unsigned long leader_nr_running; /* Nr running of group_leader */
- unsigned long min_nr_running; /* Nr running of group_min */
-#endif
-};
-
-/*
- * sg_lb_stats - stats of a sched_group required for load_balancing
- */
-struct sg_lb_stats {
- unsigned long avg_load; /*Avg load across the CPUs of the group */
- unsigned long group_load; /* Total load over the CPUs of the group */
- unsigned long sum_nr_running; /* Nr tasks running in the group */
- unsigned long sum_weighted_load; /* Weighted load of group's tasks */
- unsigned long group_capacity;
- int group_imb; /* Is there an imbalance in the group ? */
-};
-
-/**
- * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
- * @group: The group whose first cpu is to be returned.
- */
-static inline unsigned int group_first_cpu(struct sched_group *group)
-{
- return cpumask_first(sched_group_cpus(group));
-}
-
-/**
- * get_sd_load_idx - Obtain the load index for a given sched domain.
- * @sd: The sched_domain whose load_idx is to be obtained.
- * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
- */
-static inline int get_sd_load_idx(struct sched_domain *sd,
- enum cpu_idle_type idle)
-{
- int load_idx;
-
- switch (idle) {
- case CPU_NOT_IDLE:
- load_idx = sd->busy_idx;
- break;
-
- case CPU_NEWLY_IDLE:
- load_idx = sd->newidle_idx;
- break;
- default:
- load_idx = sd->idle_idx;
- break;
- }
-
- return load_idx;
-}
-
-
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-/**
- * init_sd_power_savings_stats - Initialize power savings statistics for
- * the given sched_domain, during load balancing.
- *
- * @sd: Sched domain whose power-savings statistics are to be initialized.
- * @sds: Variable containing the statistics for sd.
- * @idle: Idle status of the CPU at which we're performing load-balancing.
- */
-static inline void init_sd_power_savings_stats(struct sched_domain *sd,
- struct sd_lb_stats *sds, enum cpu_idle_type idle)
-{
- /*
- * Busy processors will not participate in power savings
- * balance.
- */
- if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
- sds->power_savings_balance = 0;
- else {
- sds->power_savings_balance = 1;
- sds->min_nr_running = ULONG_MAX;
- sds->leader_nr_running = 0;
- }
-}
-
-/**
- * update_sd_power_savings_stats - Update the power saving stats for a
- * sched_domain while performing load balancing.
- *
- * @group: sched_group belonging to the sched_domain under consideration.
- * @sds: Variable containing the statistics of the sched_domain
- * @local_group: Does group contain the CPU for which we're performing
- * load balancing ?
- * @sgs: Variable containing the statistics of the group.
- */
-static inline void update_sd_power_savings_stats(struct sched_group *group,
- struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
-{
-
- if (!sds->power_savings_balance)
- return;
-
- /*
- * If the local group is idle or completely loaded
- * no need to do power savings balance at this domain
- */
- if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
- !sds->this_nr_running))
- sds->power_savings_balance = 0;
-
- /*
- * If a group is already running at full capacity or idle,
- * don't include that group in power savings calculations
- */
- if (!sds->power_savings_balance ||
- sgs->sum_nr_running >= sgs->group_capacity ||
- !sgs->sum_nr_runn