diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2009-03-26 16:05:01 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-03-26 16:05:01 -0700 |
commit | 831576fe40f4175e0767623cffa4aeb28157943a (patch) | |
tree | de54e628e5849d6cf201df4446d760d17f752326 | |
parent | 21cdbc1378e8aa96e1ed4a606dce1a8e7daf7fdf (diff) | |
parent | 66fef08f7d5267b2312c4b48a6d2957d2d414105 (diff) |
Merge branch 'sched-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'sched-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (46 commits)
sched: Add comments to find_busiest_group() function
sched: Refactor the power savings balance code
sched: Optimize the !power_savings_balance during fbg()
sched: Create a helper function to calculate imbalance
sched: Create helper to calculate small_imbalance in fbg()
sched: Create a helper function to calculate sched_domain stats for fbg()
sched: Define structure to store the sched_domain statistics for fbg()
sched: Create a helper function to calculate sched_group stats for fbg()
sched: Define structure to store the sched_group statistics for fbg()
sched: Fix indentations in find_busiest_group() using gotos
sched: Simple helper functions for find_busiest_group()
sched: remove unused fields from struct rq
sched: jiffies not printed per CPU
sched: small optimisation of can_migrate_task()
sched: fix typos in documentation
sched: add avg_overlap decay
x86, sched_clock(): mark variables read-mostly
sched: optimize ttwu vs group scheduling
sched: TIF_NEED_RESCHED -> need_reshed() cleanup
sched: don't rebalance if attached on NULL domain
...
-rw-r--r-- | Documentation/scheduler/00-INDEX | 2 | ||||
-rw-r--r-- | Documentation/scheduler/sched-coding.txt | 126 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/intel.c | 8 | ||||
-rw-r--r-- | arch/x86/kernel/tsc.c | 9 | ||||
-rw-r--r-- | include/linux/init_task.h | 1 | ||||
-rw-r--r-- | include/linux/latencytop.h | 10 | ||||
-rw-r--r-- | include/linux/plist.h | 9 | ||||
-rw-r--r-- | include/linux/sched.h | 17 | ||||
-rw-r--r-- | init/Kconfig | 1 | ||||
-rw-r--r-- | kernel/latencytop.c | 83 | ||||
-rw-r--r-- | kernel/sched.c | 982 | ||||
-rw-r--r-- | kernel/sched_clock.c | 30 | ||||
-rw-r--r-- | kernel/sched_debug.c | 8 | ||||
-rw-r--r-- | kernel/sched_fair.c | 59 | ||||
-rw-r--r-- | kernel/sched_features.h | 3 | ||||
-rw-r--r-- | kernel/sched_rt.c | 537 | ||||
-rw-r--r-- | kernel/sched_stats.h | 7 | ||||
-rw-r--r-- | lib/Kconfig | 6 | ||||
-rw-r--r-- | lib/Makefile | 4 | ||||
-rw-r--r-- | lib/kernel_lock.c | 2 |
20 files changed, 1262 insertions, 642 deletions
diff --git a/Documentation/scheduler/00-INDEX b/Documentation/scheduler/00-INDEX index aabcc3a089b..3c00c9c3219 100644 --- a/Documentation/scheduler/00-INDEX +++ b/Documentation/scheduler/00-INDEX @@ -2,8 +2,6 @@ - this file. sched-arch.txt - CPU Scheduler implementation hints for architecture specific code. -sched-coding.txt - - reference for various scheduler-related methods in the O(1) scheduler. sched-design-CFS.txt - goals, design and implementation of the Complete Fair Scheduler. sched-domains.txt diff --git a/Documentation/scheduler/sched-coding.txt b/Documentation/scheduler/sched-coding.txt deleted file mode 100644 index cbd8db752ac..00000000000 --- a/Documentation/scheduler/sched-coding.txt +++ /dev/null @@ -1,126 +0,0 @@ - Reference for various scheduler-related methods in the O(1) scheduler - Robert Love <rml@tech9.net>, MontaVista Software - - -Note most of these methods are local to kernel/sched.c - this is by design. -The scheduler is meant to be self-contained and abstracted away. This document -is primarily for understanding the scheduler, not interfacing to it. Some of -the discussed interfaces, however, are general process/scheduling methods. -They are typically defined in include/linux/sched.h. - - -Main Scheduling Methods ------------------------ - -void load_balance(runqueue_t *this_rq, int idle) - Attempts to pull tasks from one cpu to another to balance cpu usage, - if needed. This method is called explicitly if the runqueues are - imbalanced or periodically by the timer tick. Prior to calling, - the current runqueue must be locked and interrupts disabled. - -void schedule() - The main scheduling function. Upon return, the highest priority - process will be active. - - -Locking -------- - -Each runqueue has its own lock, rq->lock. When multiple runqueues need -to be locked, lock acquires must be ordered by ascending &runqueue value. - -A specific runqueue is locked via - - task_rq_lock(task_t pid, unsigned long *flags) - -which disables preemption, disables interrupts, and locks the runqueue pid is -running on. Likewise, - - task_rq_unlock(task_t pid, unsigned long *flags) - -unlocks the runqueue pid is running on, restores interrupts to their previous -state, and reenables preemption. - -The routines - - double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) - -and - - double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) - -safely lock and unlock, respectively, the two specified runqueues. They do -not, however, disable and restore interrupts. Users are required to do so -manually before and after calls. - - -Values ------- - -MAX_PRIO - The maximum priority of the system, stored in the task as task->prio. - Lower priorities are higher. Normal (non-RT) priorities range from - MAX_RT_PRIO to (MAX_PRIO - 1). -MAX_RT_PRIO - The maximum real-time priority of the system. Valid RT priorities - range from 0 to (MAX_RT_PRIO - 1). -MAX_USER_RT_PRIO - The maximum real-time priority that is exported to user-space. Should - always be equal to or less than MAX_RT_PRIO. Setting it less allows - kernel threads to have higher priorities than any user-space task. -MIN_TIMESLICE -MAX_TIMESLICE - Respectively, the minimum and maximum timeslices (quanta) of a process. - -Data ----- - -struct runqueue - The main per-CPU runqueue data structure. -struct task_struct - The main per-process data structure. - - -General Methods ---------------- - -cpu_rq(cpu) - Returns the runqueue of the specified cpu. -this_rq() - Returns the runqueue of the current cpu. -task_rq(pid) - Returns the runqueue which holds the specified pid. -cpu_curr(cpu) - Returns the task currently running on the given cpu. -rt_task(pid) - Returns true if pid is real-time, false if not. - - -Process Control Methods ------------------------ - -void set_user_nice(task_t *p, long nice) - Sets the "nice" value of task p to the given value. -int setscheduler(pid_t pid, int policy, struct sched_param *param) - Sets the scheduling policy and parameters for the given pid. -int set_cpus_allowed(task_t *p, unsigned long new_mask) - Sets a given task's CPU affinity and migrates it to a proper cpu. - Callers must have a valid reference to the task and assure the - task not exit prematurely. No locks can be held during the call. -set_task_state(tsk, state_value) - Sets the given task's state to the given value. -set_current_state(state_value) - Sets the current task's state to the given value. -void set_tsk_need_resched(struct task_struct *tsk) - Sets need_resched in the given task. -void clear_tsk_need_resched(struct task_struct *tsk) - Clears need_resched in the given task. -void set_need_resched() - Sets need_resched in the current task. -void clear_need_resched() - Clears need_resched in the current task. -int need_resched() - Returns true if need_resched is set in the current task, false - otherwise. -yield() - Place the current process at the end of the runqueue and call schedule. diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 24ff26a38ad..5fff00c70de 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -4,6 +4,7 @@ #include <linux/string.h> #include <linux/bitops.h> #include <linux/smp.h> +#include <linux/sched.h> #include <linux/thread_info.h> #include <linux/module.h> @@ -56,11 +57,16 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) /* * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate - * with P/T states and does not stop in deep C-states + * with P/T states and does not stop in deep C-states. + * + * It is also reliable across cores and sockets. (but not across + * cabinets - we turn it off in that case explicitly.) */ if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE); + sched_clock_stable = 1; } } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index b8e7aaf7ef7..08afa1579e6 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -17,20 +17,21 @@ #include <asm/delay.h> #include <asm/hypervisor.h> -unsigned int cpu_khz; /* TSC clocks / usec, not used here */ +unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); -unsigned int tsc_khz; + +unsigned int __read_mostly tsc_khz; EXPORT_SYMBOL(tsc_khz); /* * TSC can be unstable due to cpufreq or due to unsynced TSCs */ -static int tsc_unstable; +static int __read_mostly tsc_unstable; /* native_sched_clock() is called before tsc_init(), so we must start with the TSC soft disabled to prevent erroneous rdtsc usage on !cpu_has_tsc processors */ -static int tsc_disabled = -1; +static int __read_mostly tsc_disabled = -1; static int tsc_clocksource_reliable; /* diff --git a/include/linux/init_task.h b/include/linux/init_task.h index e752d973fa2..af1de95e711 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -147,6 +147,7 @@ extern struct cred init_cred; .nr_cpus_allowed = NR_CPUS, \ }, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ + .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), \ .ptraced = LIST_HEAD_INIT(tsk.ptraced), \ .ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \ .real_parent = &tsk, \ diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h index 901c2d6377a..b0e99898527 100644 --- a/include/linux/latencytop.h +++ b/include/linux/latencytop.h @@ -9,6 +9,7 @@ #ifndef _INCLUDE_GUARD_LATENCYTOP_H_ #define _INCLUDE_GUARD_LATENCYTOP_H_ +#include <linux/compiler.h> #ifdef CONFIG_LATENCYTOP #define LT_SAVECOUNT 32 @@ -24,7 +25,14 @@ struct latency_record { struct task_struct; -void account_scheduler_latency(struct task_struct *task, int usecs, int inter); +extern int latencytop_enabled; +void __account_scheduler_latency(struct task_struct *task, int usecs, int inter); +static inline void +account_scheduler_latency(struct task_struct *task, int usecs, int inter) +{ + if (unlikely(latencytop_enabled)) + __account_scheduler_latency(task, usecs, inter); +} void clear_all_latency_tracing(struct task_struct *p); diff --git a/include/linux/plist.h b/include/linux/plist.h index 85de2f05587..45926d77d6a 100644 --- a/include/linux/plist.h +++ b/include/linux/plist.h @@ -96,6 +96,10 @@ struct plist_node { # define PLIST_HEAD_LOCK_INIT(_lock) #endif +#define _PLIST_HEAD_INIT(head) \ + .prio_list = LIST_HEAD_INIT((head).prio_list), \ + .node_list = LIST_HEAD_INIT((head).node_list) + /** * PLIST_HEAD_INIT - static struct plist_head initializer * @head: struct plist_head variable name @@ -103,8 +107,7 @@ struct plist_node { */ #define PLIST_HEAD_INIT(head, _lock) \ { \ - .prio_list = LIST_HEAD_INIT((head).prio_list), \ - .node_list = LIST_HEAD_INIT((head).node_list), \ + _PLIST_HEAD_INIT(head), \ PLIST_HEAD_LOCK_INIT(&(_lock)) \ } @@ -116,7 +119,7 @@ struct plist_node { #define PLIST_NODE_INIT(node, __prio) \ { \ .prio = (__prio), \ - .plist = PLIST_HEAD_INIT((node).plist, NULL), \ + .plist = { _PLIST_HEAD_INIT((node).plist) }, \ } /** diff --git a/include/linux/sched.h b/include/linux/sched.h index 2c36f62e754..ff904b0606d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -998,6 +998,7 @@ struct sched_class { struct rq *busiest, struct sched_domain *sd, enum cpu_idle_type idle); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); + int (*needs_post_schedule) (struct rq *this_rq); void (*post_schedule) (struct rq *this_rq); void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); @@ -1052,6 +1053,10 @@ struct sched_entity { u64 last_wakeup; u64 avg_overlap; + u64 start_runtime; + u64 avg_wakeup; + u64 nr_migrations; + #ifdef CONFIG_SCHEDSTATS u64 wait_start; u64 wait_max; @@ -1067,7 +1072,6 @@ struct sched_entity { u64 exec_max; u64 slice_max; - u64 nr_migrations; u64 nr_migrations_cold; u64 nr_failed_migrations_affine; u64 nr_failed_migrations_running; @@ -1164,6 +1168,7 @@ struct task_struct { #endif struct list_head tasks; + struct plist_node pushable_tasks; struct mm_struct *mm, *active_mm; @@ -1675,6 +1680,16 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) return set_cpus_allowed_ptr(p, &new_mask); } +/* + * Architectures can set this to 1 if they have specified + * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig, + * but then during bootup it turns out that sched_clock() + * is reliable after all: + */ +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +extern int sched_clock_stable; +#endif + extern unsigned long long sched_clock(void); extern void sched_clock_init(void); diff --git a/init/Kconfig b/init/Kconfig index 6a5c5fed66c..68699137b14 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -966,7 +966,6 @@ config SLABINFO config RT_MUTEXES boolean - select PLIST config BASE_SMALL int diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 449db466bdb..ca07c5c0c91 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -9,6 +9,44 @@ * as published by the Free Software Foundation; version 2 * of the License. */ + +/* + * CONFIG_LATENCYTOP enables a kernel latency tracking infrastructure that is + * used by the "latencytop" userspace tool. The latency that is tracked is not + * the 'traditional' interrupt latency (which is primarily caused by something + * else consuming CPU), but instead, it is the latency an application encounters + * because the kernel sleeps on its behalf for various reasons. + * + * This code tracks 2 levels of statistics: + * 1) System level latency + * 2) Per process latency + * + * The latency is stored in fixed sized data structures in an accumulated form; + * if the "same" latency cause is hit twice, this will be tracked as one entry + * in the data structure. Both the count, total accumulated latency and maximum + * latency are tracked in this data structure. When the fixed size structure is + * full, no new causes are tracked until the buffer is flushed by writing to + * the /proc file; the userspace tool does this on a regular basis. + * + * A latency cause is identified by a stringified backtrace at the point that + * the scheduler gets invoked. The userland tool will use this string to + * identify the cause of the latency in human readable form. + * + * The information is exported via /proc/latency_stats and /proc/<pid>/latency. + * These files look like this: + * + * Latency Top version : v0.1 + * 70 59433 4897 i915_irq_wait drm_ioctl vfs_ioctl do_vfs_ioctl sys_ioctl + * | | | | + * | | | +----> the stringified backtrace + * | | +---------> The maximum latency for this entry in microseconds + * | +--------------> The accumulated latency for this entry (microseconds) + * +-------------------> The number of times this entry is hit + * + * (note: the average latency is the accumulated latency divided by the number + * of times) + */ + #include <linux/latencytop.h> #include <linux/kallsyms.h> #include <linux/seq_file.h> @@ -72,7 +110,7 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record firstnonnull = i; continue; } - for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { + for (q = 0; q < LT_BACKTRACEDEPTH; q++) { unsigned long record = lat->backtrace[q]; if (latency_record[i].backtrace[q] != record) { @@ -101,31 +139,52 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record memcpy(&latency_record[i], lat, sizeof(struct latency_record)); } -static inline void store_stacktrace(struct task_struct *tsk, struct latency_record *lat) +/* + * Iterator to store a backtrace into a latency record entry + */ +static inline void store_stacktrace(struct task_struct *tsk, + struct latency_record *lat) { struct stack_trace trace; memset(&trace, 0, sizeof(trace)); trace.max_entries = LT_BACKTRACEDEPTH; trace.entries = &lat->backtrace[0]; - trace.skip = 0; save_stack_trace_tsk(tsk, &trace); } +/** + * __account_scheduler_latency - record an occured latency + * @tsk - the task struct of the task hitting the latency + * @usecs - the duration of the latency in microseconds + * @inter - 1 if the sleep was interruptible, 0 if uninterruptible + * + * This function is the main entry point for recording latency entries + * as called by the scheduler. + * + * This function has a few special cases to deal with normal 'non-latency' + * sleeps: specifically, interruptible sleep longer than 5 msec is skipped + * since this usually is caused by waiting for events via select() and co. + * + * Negative latencies (caused by time going backwards) are also explicitly + * skipped. + */ void __sched -account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) +__account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) { unsigned long flags; int i, q; struct latency_record lat; - if (!latencytop_enabled) - return; - /* Long interruptible waits are generally user requested... */ if (inter && usecs > 5000) return; + /* Negative sleeps are time going backwards */ + /* Zero-time sleeps are non-interesting */ + if (usecs <= 0) + return; + memset(&lat, 0, sizeof(lat)); lat.count = 1; lat.time = usecs; @@ -143,12 +202,12 @@ account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) if (tsk->latency_record_count >= LT_SAVECOUNT) goto out_unlock; - for (i = 0; i < LT_SAVECOUNT ; i++) { + for (i = 0; i < LT_SAVECOUNT; i++) { struct latency_record *mylat; int same = 1; mylat = &tsk->latency_record[i]; - for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { + for (q = 0; q < LT_BACKTRACEDEPTH; q++) { unsigned long record = lat.backtrace[q]; if (mylat->backtrace[q] != record) { @@ -186,7 +245,7 @@ static int lstats_show(struct seq_file *m, void *v) for (i = 0; i < MAXLR; i++) { if (latency_record[i].backtrace[0]) { int q; - seq_printf(m, "%i %li %li ", + seq_printf(m, "%i %lu %lu ", latency_record[i].count, latency_record[i].time, latency_record[i].max); @@ -223,7 +282,7 @@ static int lstats_open(struct inode *inode, struct file *filp) return single_open(filp, lstats_show, NULL); } -static struct file_operations lstats_fops = { +static const struct file_operations lstats_fops = { .open = lstats_open, .read = seq_read, .write = lstats_write, @@ -236,4 +295,4 @@ static int __init init_lstats_procfs(void) proc_create("latency_stats", 0644, NULL, &lstats_fops); return 0; } -__initcall(init_lstats_procfs); +device_initcall(init_lstats_procfs); diff --git a/kernel/sched.c b/kernel/sched.c index 8e2558c2ba6..9f8506d68fd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -331,6 +331,13 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; */ static DEFINE_SPINLOCK(task_group_lock); +#ifdef CONFIG_SMP +static int root_task_group_empty(void) +{ + return list_empty(&root_task_group.children); +} +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_USER_SCHED # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) @@ -391,6 +398,13 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #else +#ifdef CONFIG_SMP +static int root_task_group_empty(void) +{ + return 1; +} +#endif + static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } static inline struct task_group *task_group(struct task_struct *p) { @@ -467,11 +481,17 @@ struct rt_rq { struct rt_prio_array active; unsigned long rt_nr_running; #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED - int highest_prio; /* highest queued rt task prio */ + struct { + int curr; /* highest queued rt task prio */ +#ifdef CONFIG_SMP + int next; /* next highest */ +#endif + } highest_prio; #endif #ifdef CONFIG_SMP unsigned long rt_nr_migratory; int overloaded; + struct plist_head pushable_tasks; #endif int rt_throttled; u64 rt_time; @@ -549,7 +569,6 @@ struct rq { unsigned long nr_running; #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; - unsigned char idle_at_tick; #ifdef CONFIG_NO_HZ unsigned long last_tick_seen; unsigned char in_nohz_recently; @@ -590,6 +609,7 @@ struct rq { struct root_domain *rd; struct sched_domain *sd; + unsigned char idle_at_tick; /* For active balancing */ int active_balance; int push_cpu; @@ -618,9 +638,6 @@ struct rq { /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ /* sys_sched_yield() stats */ - unsigned int yld_exp_empty; - unsigned int yld_act_empty; - unsigned int yld_both_empty; unsigned int yld_count; /* schedule() stats */ @@ -1183,10 +1200,10 @@ static void resched_task(struct task_struct *p) assert_spin_locked(&task_rq(p)->lock); - if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) + if (test_tsk_need_resched(p)) return; - set_tsk_thread_flag(p, TIF_NEED_RESCHED); + set_tsk_need_resched(p); cpu = task_cpu(p); if (cpu == smp_processor_id()) @@ -1242,7 +1259,7 @@ void wake_up_idle_cpu(int cpu) * lockless. The worst case is that the other CPU runs the * idle task through an additional NOOP schedule() */ - set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED); + set_tsk_need_resched(rq->idle); /* NEED_RESCHED must be visible before we test polling */ smp_mb(); @@ -1610,21 +1627,42 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) #endif +#ifdef CONFIG_PREEMPT + /* - * double_lock_balance - lock the busiest runqueue, this_rq is locked already. + * fair double_lock_balance: Safely acquires both rq->locks in a fair + * way at the expense of forcing extra atomic operations in all + * invocations. This assures that the double_lock is acquired using the + * same underlying policy as the spinlock_t on this architecture, which + * reduces latency compared to the unfair variant below. However, it + * also adds more overhead and therefore may reduce throughput. */ -static int double_lock_balance(struct rq *this_rq, struct rq *busiest) +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + spin_unlock(&this_rq->lock); + double_rq_lock(this_rq, busiest); + + return 1; +} + +#else +/* + * Unfair double_lock_balance: Optimizes throughput at the expense of + * latency by eliminating extra atomic operations when the locks are + * already in proper order on entry. This favors lower cpu-ids and will + * grant the double lock to lower cpus over higher ids under contention, + * regardless of entry order into the function. + */ +static int _double_lock_balance(struct rq *this_rq, struct rq *busiest) __releases(this_rq->lock) __acquires(busiest->lock) __acquires(this_rq->lock) { int ret = 0; - if (unlikely(!irqs_disabled())) { - /* printk() doesn't work good under rq->lock */ - spin_unlock(&this_rq->lock); - BUG_ON(1); - } if (unlikely(!spin_trylock(&busiest->lock))) { if (busiest < this_rq) { spin_unlock(&this_rq->lock); @@ -1637,6 +1675,22 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest) return ret; } +#endif /* CONFIG_PREEMPT */ + +/* + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. + */ +static int double_lock_balance(struct rq *this_rq, struct rq *busiest) +{ + if (unlikely(!irqs_disabled())) { + /* printk() doesn't work good under rq->lock */ + spin_unlock(&this_rq->lock); + BUG_ON(1); + } + + return _double_lock_balance(this_rq, busiest); +} + static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) __releases(busiest->lock) { @@ -1705,6 +1759,9 @@ static void update_avg(u64 *avg, u64 sample) static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) { + if (wakeup) + p->se.start_runtime = p->se.sum_exec_runtime; + sched_info_queued(p); p->sched_class->enqueue_task(rq, p, wakeup); p->se.on_rq = 1; @@ -1712,10 +1769,15 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) { - if (sleep && p->se.last_wakeup) { - update_avg(&p->se.avg_overlap, - p->se.sum_exec_runtime - p->se.last_wakeup); - p->se.last_wakeup = 0; + if (sleep) { + if (p->se.last_wakeup) { + update_avg(&p->se.avg_overlap, + p->se.sum_exec_runtime - p->se.last_wakeup); + p->se.last_wakeup = 0; + } else { + update_avg(&p->se.avg_wakeup, + sysctl_sched_wakeup_granularity); + } } sched_info_dequeued(p); @@ -2017,7 +2079,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * it must be off the runqueue _entirely_, and not * preempted! * - * So if it wa still runnable (but just not actively + * So if it was still runnable (but just not actively * running right now), it's preempted, and we should * yield - it could be a while. */ @@ -2267,7 +2329,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) sync = 0; #ifdef CONFIG_SMP - if (sched_feat(LB_WAKEUP_UPDATE)) { + if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) { struct sched_domain *sd; this_cpu = raw_smp_processor_id(); @@ -2345,6 +2407,22 @@ out_activate: activate_task(rq, p, 1); success = 1; + /* + * Only attribute actual wakeups done by this task. + */ + if (!in_interrupt()) { + struct sched_entity *se = ¤t->se; + u64 sample = se->sum_exec_runtime; + + if (se->last_wakeup) + sample -= se->last_wakeup; + else + sample -= se->start_runtime; + update_avg(&se->avg_wakeup, sample); + + se->last_wakeup = se->sum_exec_runtime; + } + out_running: trace_sched_wakeup(rq, p, success); check_preempt_curr(rq, p, sync); @@ -2355,8 +2433,6 @@ out_running: p->sched_class->task_wake_up(rq, p); #endif out: - current->se.last_wakeup = current->se.sum_exec_runtime; - task_rq_unlock(rq, &flags); return success; @@ -2386,6 +2462,8 @@ static void __sched_fork(struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.last_wakeup = 0; p->se.avg_overlap = 0; + p->se.start_runtime = 0; + p->se.avg_wakeup = sysctl_sched_wakeup_granularity; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; @@ -2448,6 +2526,8 @@ void sched_fork(struct task_struct *p, int clone_flags) /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; #endif + plist_node_init(&p->pushable_tasks, MAX_PRIO); + put_cpu(); } @@ -2491,7 +2571,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) #ifdef CONFIG_PREEMPT_NOTIFIERS /** - * preempt_notifier_register - tell me when current is being being preempted & rescheduled + * preempt_notifier_register - tell me when current is being preempted & rescheduled * @notifier: notifier struct to register */ void preempt_notifier_register(struct preempt_notifier *notifier) @@ -2588,6 +2668,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) { struct mm_struct *mm = rq->prev_mm; long prev_state; +#ifdef CONFIG_SMP + int post_schedule = 0; + + if (current->sched_class->needs_post_schedule) + post_schedule = current->sched_class->needs_post_schedule(rq); +#endif rq->prev_mm = NULL; @@ -2606,7 +2692,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) finish_arch_switch(prev); finish_lock_switch(rq, prev); #ifdef CONFIG_SMP - if (current->sched_class->post_schedule) + if (post_schedule) current->sched_class->post_schedule(rq); #endif @@ -2913,6 +2999,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned) { + int tsk_cache_hot = 0; /* * We do not migrate tasks that are: * 1) running (obviously), or @@ -2936,10 +3023,11 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) too many balance attempts have failed. */ - if (!task_hot(p, rq->clock, sd) || - sd->nr_balance_failed > sd->cache_nice_tries) { + tsk_cache_hot = task_hot(p, rq->clock, sd); + if (!tsk_cache_hot || + sd->nr_balance_failed > sd->cache_nice_tries) { #ifdef CONFIG_SCHEDSTATS - if (task_hot(p, rq->clock, sd)) { + if (tsk_cache_hot) { schedstat_inc(sd, lb_hot_gained[idle]); schedstat_inc(p, se.nr_forced_migrations); } @@ -2947,7 +3035,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, return 1; } - if (task_hot(p, rq->clock, sd)) { + if (tsk_cache_hot) { schedstat_inc(p, se.nr_failed_migrations_hot); return 0; } @@ -2987,6 +3075,16 @@ next: pulled++; rem_load_move -= p->se.load.weight; +#ifdef CONFIG_PREEMPT + /* + * NEWIDLE balancing is a source of latency, so preemptible kernels + * will stop after the first task is pulled to minimize the critical + * section. + */ + if (idle == CPU_NEWLY_IDLE) + goto out; +#endif + /* * We only want to steal up to the prescribed amount of weighted load. */ @@ -3033,9 +3131,15 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, sd, idle, all_pinned, &this_best_prio); class = class->next; +#ifdef CONFIG_PREEMPT + /* + * NEWIDLE balancing is a source of latency, so preemptible + * kernels will stop after the first task is pulled to minimize + * the critical section. + */ if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) break; - +#endif } while (class && max_load_move > total_load_moved); return total_load_moved > 0; @@ -3085,246 +3189,479 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, return 0; } +/********** Helpers for find_busiest_group ************************/ +/** + * sd_lb_stats - Structure to store the statistics of a sched_domain + * during load balancing. + */ +struct sd_lb_stats { + struct sched_group *busiest; /* Busiest group in this sd */ + struct sched_group *this; /* Local group in this sd */ + unsigned long total_load; /* Total load of all groups in sd */ + unsigned long total_pwr; /* Total power of all groups in sd */ + unsigned long avg_load; /* Average load across all groups in sd */ + |