aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2009-09-17 21:00:02 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2009-09-17 21:00:02 -0700
commitdcbf77b9e86e1726f5fbd01bb98820dac06d456e (patch)
tree2f0b728ce70c03e1d0e3461e8a3c3d1fbe68fb90
parentca043a66ae48c74fa628ec92178f7a54f5b9a106 (diff)
parent29cd8bae396583a2ee9a3340db8c5102acf9f6fd (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (37 commits) sched: Fix SD_POWERSAVING_BALANCE|SD_PREFER_LOCAL vs SD_WAKE_AFFINE sched: Stop buddies from hogging the system sched: Add new wakeup preemption mode: WAKEUP_RUNNING sched: Fix TASK_WAKING & loadaverage breakage sched: Disable wakeup balancing sched: Rename flags to wake_flags sched: Clean up the load_idx selection in select_task_rq_fair sched: Optimize cgroup vs wakeup a bit sched: x86: Name old_perf in a unique way sched: Implement a gentler fair-sleepers feature sched: Add SD_PREFER_LOCAL sched: Add a few SYNC hint knobs to play with sched: Fix sync wakeups again sched: Add WF_FORK sched: Rename sync arguments sched: Rename select_task_rq() argument sched: Feature to disable APERF/MPERF cpu_power x86: sched: Provide arch implementations using aperf/mperf x86: Add generic aperf/mperf code x86: Move APERF/MPERF into a X86_FEATURE ... Fix up trivial conflict in arch/x86/include/asm/processor.h due to nearby addition of amd_get_nb_id() declaration from the EDAC merge.
-rw-r--r--arch/ia64/include/asm/topology.h17
-rw-r--r--arch/mips/include/asm/mach-ip27/topology.h1
-rw-r--r--arch/powerpc/include/asm/topology.h9
-rw-r--r--arch/sh/include/asm/topology.h10
-rw-r--r--arch/sparc/include/asm/topology_64.h7
-rw-r--r--arch/x86/include/asm/cpufeature.h1
-rw-r--r--arch/x86/include/asm/processor.h30
-rw-r--r--arch/x86/include/asm/topology.h14
-rw-r--r--arch/x86/kernel/cpu/Makefile2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c88
-rw-r--r--arch/x86/kernel/cpu/intel.c6
-rw-r--r--arch/x86/kernel/cpu/sched.c55
-rw-r--r--include/linux/sched.h23
-rw-r--r--include/linux/topology.h32
-rw-r--r--include/linux/wait.h4
-rw-r--r--kernel/sched.c444
-rw-r--r--kernel/sched_debug.c1
-rw-r--r--kernel/sched_fair.c414
-rw-r--r--kernel/sched_features.h122
-rw-r--r--kernel/sched_idletask.c4
-rw-r--r--kernel/sched_rt.c7
21 files changed, 688 insertions, 603 deletions
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index 7b4c8c70b2d..d0141fbf51d 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -61,12 +61,13 @@ void build_cpu_to_node_map(void);
.cache_nice_tries = 2, \
.busy_idx = 2, \
.idle_idx = 1, \
- .newidle_idx = 2, \
- .wake_idx = 1, \
- .forkexec_idx = 1, \
+ .newidle_idx = 0, \
+ .wake_idx = 0, \
+ .forkexec_idx = 0, \
.flags = SD_LOAD_BALANCE \
| SD_BALANCE_NEWIDLE \
| SD_BALANCE_EXEC \
+ | SD_BALANCE_FORK \
| SD_WAKE_AFFINE, \
.last_balance = jiffies, \
.balance_interval = 1, \
@@ -85,14 +86,14 @@ void build_cpu_to_node_map(void);
.cache_nice_tries = 2, \
.busy_idx = 3, \
.idle_idx = 2, \
- .newidle_idx = 2, \
- .wake_idx = 1, \
- .forkexec_idx = 1, \
+ .newidle_idx = 0, \
+ .wake_idx = 0, \
+ .forkexec_idx = 0, \
.flags = SD_LOAD_BALANCE \
+ | SD_BALANCE_NEWIDLE \
| SD_BALANCE_EXEC \
| SD_BALANCE_FORK \
- | SD_SERIALIZE \
- | SD_WAKE_BALANCE, \
+ | SD_SERIALIZE, \
.last_balance = jiffies, \
.balance_interval = 64, \
.nr_balance_failed = 0, \
diff --git a/arch/mips/include/asm/mach-ip27/topology.h b/arch/mips/include/asm/mach-ip27/topology.h
index 07547231e07..23059170700 100644
--- a/arch/mips/include/asm/mach-ip27/topology.h
+++ b/arch/mips/include/asm/mach-ip27/topology.h
@@ -48,7 +48,6 @@ extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES];
.cache_nice_tries = 1, \
.flags = SD_LOAD_BALANCE \
| SD_BALANCE_EXEC \
- | SD_WAKE_BALANCE, \
.last_balance = jiffies, \
.balance_interval = 1, \
.nr_balance_failed = 0, \
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index 054a16d6808..394edcbcce7 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -57,14 +57,13 @@ static inline int pcibus_to_node(struct pci_bus *bus)
.cache_nice_tries = 1, \
.busy_idx = 3, \
.idle_idx = 1, \
- .newidle_idx = 2, \
- .wake_idx = 1, \
+ .newidle_idx = 0, \
+ .wake_idx = 0, \
.flags = SD_LOAD_BALANCE \
| SD_BALANCE_EXEC \
+ | SD_BALANCE_FORK \
| SD_BALANCE_NEWIDLE \
- | SD_WAKE_IDLE \
- | SD_SERIALIZE \
- | SD_WAKE_BALANCE, \
+ | SD_SERIALIZE, \
.last_balance = jiffies, \
.balance_interval = 1, \
.nr_balance_failed = 0, \
diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h
index b69ee850906..f8c40cc6505 100644
--- a/arch/sh/include/asm/topology.h
+++ b/arch/sh/include/asm/topology.h
@@ -15,14 +15,14 @@
.cache_nice_tries = 2, \
.busy_idx = 3, \
.idle_idx = 2, \
- .newidle_idx = 2, \
- .wake_idx = 1, \
- .forkexec_idx = 1, \
+ .newidle_idx = 0, \
+ .wake_idx = 0, \
+ .forkexec_idx = 0, \
.flags = SD_LOAD_BALANCE \
| SD_BALANCE_FORK \
| SD_BALANCE_EXEC \
- | SD_SERIALIZE \
- | SD_WAKE_BALANCE, \
+ | SD_BALANCE_NEWIDLE \
+ | SD_SERIALIZE, \
.last_balance = jiffies, \
.balance_interval = 1, \
.nr_balance_failed = 0, \
diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h
index e5ea8d33242..26cd25c0839 100644
--- a/arch/sparc/include/asm/topology_64.h
+++ b/arch/sparc/include/asm/topology_64.h
@@ -52,13 +52,12 @@ static inline int pcibus_to_node(struct pci_bus *pbus)
.busy_idx = 3, \
.idle_idx = 2, \
.newidle_idx = 0, \
- .wake_idx = 1, \
- .forkexec_idx = 1, \
+ .wake_idx = 0, \
+ .forkexec_idx = 0, \
.flags = SD_LOAD_BALANCE \
| SD_BALANCE_FORK \
| SD_BALANCE_EXEC \
- | SD_SERIALIZE \
- | SD_WAKE_BALANCE, \
+ | SD_SERIALIZE, \
.last_balance = jiffies, \
.balance_interval = 1, \
}
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 847fee6493a..9cfc88b9774 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -96,6 +96,7 @@
#define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */
#define X86_FEATURE_EXTD_APICID (3*32+26) /* has extended APICID (8 bits) */
#define X86_FEATURE_AMD_DCM (3*32+27) /* multi-node processor */
+#define X86_FEATURE_APERFMPERF (3*32+28) /* APERFMPERF */
/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 42a3f936dad..c3429e8b242 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -27,6 +27,7 @@ struct mm_struct;
#include <linux/cpumask.h>
#include <linux/cache.h>
#include <linux/threads.h>
+#include <linux/math64.h>
#include <linux/init.h>
/*
@@ -1022,4 +1023,33 @@ extern int set_tsc_mode(unsigned int val);
extern int amd_get_nb_id(int cpu);
+struct aperfmperf {
+ u64 aperf, mperf;
+};
+
+static inline void get_aperfmperf(struct aperfmperf *am)
+{
+ WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_APERFMPERF));
+
+ rdmsrl(MSR_IA32_APERF, am->aperf);
+ rdmsrl(MSR_IA32_MPERF, am->mperf);
+}
+
+#define APERFMPERF_SHIFT 10
+
+static inline
+unsigned long calc_aperfmperf_ratio(struct aperfmperf *old,
+ struct aperfmperf *new)
+{
+ u64 aperf = new->aperf - old->aperf;
+ u64 mperf = new->mperf - old->mperf;
+ unsigned long ratio = aperf;
+
+ mperf >>= APERFMPERF_SHIFT;
+ if (mperf)
+ ratio = div64_u64(aperf, mperf);
+
+ return ratio;
+}
+
#endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 26d06e052a1..6f0695d744b 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -116,15 +116,11 @@ extern unsigned long node_remap_size[];
# define SD_CACHE_NICE_TRIES 1
# define SD_IDLE_IDX 1
-# define SD_NEWIDLE_IDX 2
-# define SD_FORKEXEC_IDX 0
#else
# define SD_CACHE_NICE_TRIES 2
# define SD_IDLE_IDX 2
-# define SD_NEWIDLE_IDX 2
-# define SD_FORKEXEC_IDX 1
#endif
@@ -137,22 +133,20 @@ extern unsigned long node_remap_size[];
.cache_nice_tries = SD_CACHE_NICE_TRIES, \
.busy_idx = 3, \
.idle_idx = SD_IDLE_IDX, \
- .newidle_idx = SD_NEWIDLE_IDX, \
- .wake_idx = 1, \
- .forkexec_idx = SD_FORKEXEC_IDX, \
+ .newidle_idx = 0, \
+ .wake_idx = 0, \
+ .forkexec_idx = 0, \
\
.flags = 1*SD_LOAD_BALANCE \
| 1*SD_BALANCE_NEWIDLE \
| 1*SD_BALANCE_EXEC \
| 1*SD_BALANCE_FORK \
- | 0*SD_WAKE_IDLE \
+ | 0*SD_BALANCE_WAKE \
| 1*SD_WAKE_AFFINE \
- | 1*SD_WAKE_BALANCE \
| 0*SD_SHARE_CPUPOWER \
| 0*SD_POWERSAVINGS_BALANCE \
| 0*SD_SHARE_PKG_RESOURCES \
| 1*SD_SERIALIZE \
- | 1*SD_WAKE_IDLE_FAR \
| 0*SD_PREFER_SIBLING \
, \
.last_balance = jiffies, \
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index c1f253dac15..8dd30638fe4 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -13,7 +13,7 @@ CFLAGS_common.o := $(nostackp)
obj-y := intel_cacheinfo.o addon_cpuid_features.o
obj-y += proc.o capflags.o powerflags.o common.o
-obj-y += vmware.o hypervisor.o
+obj-y += vmware.o hypervisor.o sched.o
obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
obj-$(CONFIG_X86_64) += bugs_64.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index ae9b503220c..4109679863c 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -60,7 +60,6 @@ enum {
};
#define INTEL_MSR_RANGE (0xffff)
-#define CPUID_6_ECX_APERFMPERF_CAPABILITY (0x1)
struct acpi_cpufreq_data {
struct acpi_processor_performance *acpi_data;
@@ -71,11 +70,7 @@ struct acpi_cpufreq_data {
static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data);
-struct acpi_msr_data {
- u64 saved_aperf, saved_mperf;
-};
-
-static DEFINE_PER_CPU(struct acpi_msr_data, msr_data);
+static DEFINE_PER_CPU(struct aperfmperf, old_perf);
DEFINE_TRACE(power_mark);
@@ -244,23 +239,12 @@ static u32 get_cur_val(const struct cpumask *mask)
return cmd.val;
}
-struct perf_pair {
- union {
- struct {
- u32 lo;
- u32 hi;
- } split;
- u64 whole;
- } aperf, mperf;
-};
-
/* Called via smp_call_function_single(), on the target CPU */
static void read_measured_perf_ctrs(void *_cur)
{
- struct perf_pair *cur = _cur;
+ struct aperfmperf *am = _cur;
- rdmsr(MSR_IA32_APERF, cur->aperf.split.lo, cur->aperf.split.hi);
- rdmsr(MSR_IA32_MPERF, cur->mperf.split.lo, cur->mperf.split.hi);
+ get_aperfmperf(am);
}
/*
@@ -279,63 +263,17 @@ static void read_measured_perf_ctrs(void *_cur)
static unsigned int get_measured_perf(struct cpufreq_policy *policy,
unsigned int cpu)
{
- struct perf_pair readin, cur;
- unsigned int perf_percent;
+ struct aperfmperf perf;
+ unsigned long ratio;
unsigned int retval;
- if (smp_call_function_single(cpu, read_measured_perf_ctrs, &readin, 1))
+ if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
return 0;
- cur.aperf.whole = readin.aperf.whole -
- per_cpu(msr_data, cpu).saved_aperf;
- cur.mperf.whole = readin.mperf.whole -
- per_cpu(msr_data, cpu).saved_mperf;
- per_cpu(msr_data, cpu).saved_aperf = readin.aperf.whole;
- per_cpu(msr_data, cpu).saved_mperf = readin.mperf.whole;
-
-#ifdef __i386__
- /*
- * We dont want to do 64 bit divide with 32 bit kernel
- * Get an approximate value. Return failure in case we cannot get
- * an approximate value.
- */
- if (unlikely(cur.aperf.split.hi || cur.mperf.split.hi)) {
- int shift_count;
- u32 h;
-
- h = max_t(u32, cur.aperf.split.hi, cur.mperf.split.hi);
- shift_count = fls(h);
-
- cur.aperf.whole >>= shift_count;
- cur.mperf.whole >>= shift_count;
- }
-
- if (((unsigned long)(-1) / 100) < cur.aperf.split.lo) {
- int shift_count = 7;
- cur.aperf.split.lo >>= shift_count;
- cur.mperf.split.lo >>= shift_count;
- }
-
- if (cur.aperf.split.lo && cur.mperf.split.lo)
- perf_percent = (cur.aperf.split.lo * 100) / cur.mperf.split.lo;
- else
- perf_percent = 0;
+ ratio = calc_aperfmperf_ratio(&per_cpu(old_perf, cpu), &perf);
+ per_cpu(old_perf, cpu) = perf;
-#else
- if (unlikely(((unsigned long)(-1) / 100) < cur.aperf.whole)) {
- int shift_count = 7;
- cur.aperf.whole >>= shift_count;
- cur.mperf.whole >>= shift_count;
- }
-
- if (cur.aperf.whole && cur.mperf.whole)
- perf_percent = (cur.aperf.whole * 100) / cur.mperf.whole;
- else
- perf_percent = 0;
-
-#endif
-
- retval = (policy->cpuinfo.max_freq * perf_percent) / 100;
+ retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
return retval;
}
@@ -731,12 +669,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
acpi_processor_notify_smm(THIS_MODULE);
/* Check for APERF/MPERF support in hardware */
- if (c->x86_vendor == X86_VENDOR_INTEL && c->cpuid_level >= 6) {
- unsigned int ecx;
- ecx = cpuid_ecx(6);
- if (ecx & CPUID_6_ECX_APERFMPERF_CAPABILITY)
- acpi_cpufreq_driver.getavg = get_measured_perf;
- }
+ if (cpu_has(c, X86_FEATURE_APERFMPERF))
+ acpi_cpufreq_driver.getavg = get_measured_perf;
dprintk("CPU%u - ACPI performance management activated.\n", cpu);
for (i = 0; i < perf->state_count; i++)
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 80a722a071b..40e1835b35e 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -350,6 +350,12 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
}
+ if (c->cpuid_level > 6) {
+ unsigned ecx = cpuid_ecx(6);
+ if (ecx & 0x01)
+ set_cpu_cap(c, X86_FEATURE_APERFMPERF);
+ }
+
if (cpu_has_xmm2)
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
if (cpu_has_ds) {
diff --git a/arch/x86/kernel/cpu/sched.c b/arch/x86/kernel/cpu/sched.c
new file mode 100644
index 00000000000..a640ae5ad20
--- /dev/null
+++ b/arch/x86/kernel/cpu/sched.c
@@ -0,0 +1,55 @@
+#include <linux/sched.h>
+#include <linux/math64.h>
+#include <linux/percpu.h>
+#include <linux/irqflags.h>
+
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+
+#ifdef CONFIG_SMP
+
+static DEFINE_PER_CPU(struct aperfmperf, old_perf_sched);
+
+static unsigned long scale_aperfmperf(void)
+{
+ struct aperfmperf val, *old = &__get_cpu_var(old_perf_sched);
+ unsigned long ratio, flags;
+
+ local_irq_save(flags);
+ get_aperfmperf(&val);
+ local_irq_restore(flags);
+
+ ratio = calc_aperfmperf_ratio(old, &val);
+ *old = val;
+
+ return ratio;
+}
+
+unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
+{
+ /*
+ * do aperf/mperf on the cpu level because it includes things
+ * like turbo mode, which are relevant to full cores.
+ */
+ if (boot_cpu_has(X86_FEATURE_APERFMPERF))
+ return scale_aperfmperf();
+
+ /*
+ * maybe have something cpufreq here
+ */
+
+ return default_scale_freq_power(sd, cpu);
+}
+
+unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu)
+{
+ /*
+ * aperf/mperf already includes the smt gain
+ */
+ if (boot_cpu_has(X86_FEATURE_APERFMPERF))
+ return SCHED_LOAD_SCALE;
+
+ return default_scale_smt_power(sd, cpu);
+}
+
+#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f3d74bd04d1..8af3d249170 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -190,6 +190,7 @@ extern unsigned long long time_sync_thresh;
/* in tsk->state again */
#define TASK_DEAD 64
#define TASK_WAKEKILL 128
+#define TASK_WAKING 256
/* Convenience macros for the sake of set_task_state */
#define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
@@ -802,14 +803,14 @@ enum cpu_idle_type {
#define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */
#define SD_BALANCE_EXEC 0x0004 /* Balance on exec */
#define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */
-#define SD_WAKE_IDLE 0x0010 /* Wake to idle CPU on task wakeup */
+#define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */
#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
-#define SD_WAKE_BALANCE 0x0040 /* Perform balancing at task wakeup */
+#define SD_PREFER_LOCAL 0x0040 /* Prefer to keep tasks local to this domain */
#define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */
#define SD_POWERSAVINGS_BALANCE 0x0100 /* Balance for power savings */
#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */
#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
-#define SD_WAKE_IDLE_FAR 0x0800 /* Gain latency sacrificing cache hit */
+
#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
enum powersavings_balance_level {
@@ -991,6 +992,9 @@ static inline int test_sd_parent(struct sched_domain *sd, int flag)
return 0;
}
+unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu);
+unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu);
+
#else /* CONFIG_SMP */
struct sched_domain_attr;
@@ -1002,6 +1006,7 @@ partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
}
#endif /* !CONFIG_SMP */
+
struct io_context; /* See blkdev.h */
@@ -1019,6 +1024,12 @@ struct uts_namespace;
struct rq;
struct sched_domain;
+/*
+ * wake flags
+ */
+#define WF_SYNC 0x01 /* waker goes to sleep after wakup */
+#define WF_FORK 0x02 /* child wakeup after fork */
+
struct sched_class {
const struct sched_class *next;
@@ -1026,13 +1037,13 @@ struct sched_class {
void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
void (*yield_task) (struct rq *rq);
- void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int sync);
+ void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
struct task_struct * (*pick_next_task) (struct rq *rq);
void (*put_prev_task) (struct rq *rq, struct task_struct *p);
#ifdef CONFIG_SMP
- int (*select_task_rq)(struct task_struct *p, int sync);
+ int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
unsigned long (*load_balance) (struct rq *this_rq, int this_cpu,
struct rq *busiest, unsigned long max_load_move,
@@ -1102,6 +1113,8 @@ struct sched_entity {
u64 start_runtime;
u64 avg_wakeup;
+ u64 avg_running;
+
#ifdef CONFIG_SCHEDSTATS
u64 wait_start;
u64 wait_max;
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 85e8cf7d393..809b26c0709 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -95,14 +95,12 @@ int arch_update_cpu_topology(void);
| 1*SD_BALANCE_NEWIDLE \
| 1*SD_BALANCE_EXEC \
| 1*SD_BALANCE_FORK \
- | 0*SD_WAKE_IDLE \
+ | 0*SD_BALANCE_WAKE \
| 1*SD_WAKE_AFFINE \
- | 1*SD_WAKE_BALANCE \
| 1*SD_SHARE_CPUPOWER \
| 0*SD_POWERSAVINGS_BALANCE \
| 0*SD_SHARE_PKG_RESOURCES \
| 0*SD_SERIALIZE \
- | 0*SD_WAKE_IDLE_FAR \
| 0*SD_PREFER_SIBLING \
, \
.last_balance = jiffies, \
@@ -122,20 +120,19 @@ int arch_update_cpu_topology(void);
.imbalance_pct = 125, \
.cache_nice_tries = 1, \
.busy_idx = 2, \
- .wake_idx = 1, \
- .forkexec_idx = 1, \
+ .wake_idx = 0, \
+ .forkexec_idx = 0, \
\
.flags = 1*SD_LOAD_BALANCE \
| 1*SD_BALANCE_NEWIDLE \
| 1*SD_BALANCE_EXEC \
| 1*SD_BALANCE_FORK \
- | 1*SD_WAKE_IDLE \
+ | 0*SD_BALANCE_WAKE \
| 1*SD_WAKE_AFFINE \
- | 1*SD_WAKE_BALANCE \
+ | 1*SD_PREFER_LOCAL \
| 0*SD_SHARE_CPUPOWER \
| 1*SD_SHARE_PKG_RESOURCES \
| 0*SD_SERIALIZE \
- | 0*SD_WAKE_IDLE_FAR \
| sd_balance_for_mc_power() \
| sd_power_saving_flags() \
, \
@@ -155,21 +152,20 @@ int arch_update_cpu_topology(void);
.cache_nice_tries = 1, \
.busy_idx = 2, \
.idle_idx = 1, \
- .newidle_idx = 2, \
- .wake_idx = 1, \
- .forkexec_idx = 1, \
+ .newidle_idx = 0, \
+ .wake_idx = 0, \
+ .forkexec_idx = 0, \
\
.flags = 1*SD_LOAD_BALANCE \
| 1*SD_BALANCE_NEWIDLE \
| 1*SD_BALANCE_EXEC \
| 1*SD_BALANCE_FORK \
- | 1*SD_WAKE_IDLE \
- | 0*SD_WAKE_AFFINE \
- | 1*SD_WAKE_BALANCE \
+ | 0*SD_BALANCE_WAKE \
+ | 1*SD_WAKE_AFFINE \
+ | 1*SD_PREFER_LOCAL \
| 0*SD_SHARE_CPUPOWER \
| 0*SD_SHARE_PKG_RESOURCES \
| 0*SD_SERIALIZE \
- | 0*SD_WAKE_IDLE_FAR \
| sd_balance_for_package_power() \
| sd_power_saving_flags() \
, \
@@ -191,14 +187,12 @@ int arch_update_cpu_topology(void);
| 1*SD_BALANCE_NEWIDLE \
| 0*SD_BALANCE_EXEC \
| 0*SD_BALANCE_FORK \
- | 0*SD_WAKE_IDLE \
- | 1*SD_WAKE_AFFINE \
- | 0*SD_WAKE_BALANCE \
+ | 0*SD_BALANCE_WAKE \
+ | 0*SD_WAKE_AFFINE \
| 0*SD_SHARE_CPUPOWER \
| 0*SD_POWERSAVINGS_BALANCE \
| 0*SD_SHARE_PKG_RESOURCES \
| 1*SD_SERIALIZE \
- | 1*SD_WAKE_IDLE_FAR \
| 0*SD_PREFER_SIBLING \
, \
.last_balance = jiffies, \
diff --git a/include/linux/wait.h b/include/linux/wait.h
index cf3c2f5dba5..a48e16b77d5 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -26,8 +26,8 @@
#include <asm/current.h>
typedef struct __wait_queue wait_queue_t;
-typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int sync, void *key);
-int default_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
+typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key);
+int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *key);
struct __wait_queue {
unsigned int flags;
diff --git a/kernel/sched.c b/kernel/sched.c
index d9db3fb1757..faf4d463bbf 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -119,8 +119,6 @@
*/
#define RUNTIME_INF ((u64)~0ULL)
-static void double_rq_lock(struct rq *rq1, struct rq *rq2);
-
static inline int rt_policy(int policy)
{
if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
@@ -378,13 +376,6 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
#else
-#ifdef CONFIG_SMP
-static int root_task_group_empty(void)
-{
- return 1;
-}
-#endif
-
static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
static inline struct task_group *task_group(struct task_struct *p)
{
@@ -514,14 +505,6 @@ struct root_domain {
#ifdef CONFIG_SMP
struct cpupri cpupri;
#endif
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
- /*
- * Preferred wake up cpu nominated by sched_mc balance that will be
- * used when most cpus are idle in the system indicating overall very
- * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
- */
- unsigned int sched_mc_preferred_wakeup_cpu;
-#endif
};
/*
@@ -646,9 +629,10 @@ struct rq {
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
-static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
+static inline
+void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
- rq->curr->sched_class->check_preempt_curr(rq, p, sync);
+ rq->curr->sched_class->check_preempt_curr(rq, p, flags);
}
static inline int cpu_of(struct rq *rq)
@@ -1509,8 +1493,65 @@ static int tg_nop(struct task_group *tg, void *data)
#endif
#ifdef CONFIG_SMP
-static unsigned long source_load(int cpu, int type);
-static unsigned long target_load(int cpu, int type);
+/* Used instead of source_load when we know the type == 0 */
+static unsigned long weighted_cpuload(const int cpu)
+{
+ return cpu_rq(cpu)->load.weight;
+}
+
+/*
+ * Return a low guess at the load of a migration-source cpu weighted
+ * according to the scheduling class and "nice" value.
+ *
+ * We want to under-estimate the load of migration sources, to
+ * balance conservatively.
+ */
+static unsigned long source_load(int cpu, int type)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long total = weighted_cpuload(cpu);
+
+ if (type == 0 || !sched_feat(LB_BIAS))
+ return total;
+
+ return min(rq->cpu_load[type-1], total);
+}
+
+/*
+ * Return a high guess at the load of a migration-target cpu weighted
+ * according to the scheduling class and "nice" value.
+ */
+static unsigned long target_load(int cpu, int type)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long total = weighted_cpuload(cpu);
+
+ if (type == 0 || !sched_feat(LB_BIAS))
+ return total;
+
+ return max(rq->cpu_load[type-1], total);
+}
+
+static struct sched_group *group_of(int cpu)
+{
+ struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
+
+ if (!sd)
+ return NULL;
+
+ return sd->groups;
+}
+
+static unsigned long power_of(int cpu)
+{
+ struct sched_group *group = group_of(cpu);
+
+ if (!group)
+ return SCHED_LOAD_SCALE;
+
+ return group->cpu_power;
+}
+
static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
static unsigned long cpu_avg_load_per_task(int cpu)
@@ -1695,6 +1736,8 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
#ifdef CONFIG_PREEMPT
+static void double_rq_lock(struct rq *rq1, struct rq *rq2);
+
/*
* fair double_lock_balance: Safely acquires both rq->locks in a fair
* way at the expense of forcing extra atomic operations in all
@@ -1959,13 +2002,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
}
#ifdef CONFIG_SMP
-
-/* Used instead of source_load when we know the type == 0 */
-static unsigned long weighted_cpuload(const int cpu)
-{
- return cpu_rq(cpu)->load.weight;
-}
-
/*
* Is this task likely cache-hot:
*/
@@ -2239,185 +2275,6 @@ void kick_process(struct task_struct *p)
preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);
-
-/*
- * Return a low guess at the load of a migration-source cpu weighted
- * according to the scheduling class and "nice" value.
- *
- * We want to under-estimate the load of migration sources, to
- * balance conservatively.
- */
-static unsigned long source_load(int cpu, int type)
-{
- struct rq *rq = cpu_rq(cpu);
- unsigned long total = weighted_cpuload(cpu);
-
- if (type == 0 || !sched_feat(LB_BIAS))
- return total;
-
- return min(rq->cpu_load[type-1], total);
-}
-
-/*
- * Return a high guess at the load of a migration-target cpu weighted
- * according to the scheduling class and "nice" value.
- */
-static unsigned long target_load(int cpu, int type)
-{
- struct rq *rq = cpu_rq(cpu);
- unsigned long total = weighted_cpuload(cpu);
-
- if (type == 0 || !sched_feat(LB_BIAS))
- return total;
-
- return max(rq->cpu_load[type-1], total);
-}
-
-/*
- * find_idlest_group finds and returns the least busy CPU group within the
- * domain.
- */
-static struct sched_group *
-find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
-{
- struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
- unsigned long min_load = ULONG_MAX, this_load = 0;
- int load_idx = sd->forkexec_idx;
- int imbalance = 100 + (sd->imbalance_pct-100)/2;
-
- do {
- unsigned long load, avg_load;
- int local_group;
- int i;
-
- /* Skip over this group if it has no CPUs allowed */
- if (!cpumask_intersects(sched_group_cpus(group),
- &p->cpus_allowed))
- continue;
-
- local_group = cpumask_test_cpu(this_cpu,
- sched_group_cpus(group));
-
- /* Tally up the load of all CPUs in the group */
- avg_load = 0;
-
- for_each_cpu(i, sched_group_cpus(group)) {
- /* Bias balancing toward cpus of our domain */
- if (local_group)
- load = source_load(i, load_idx);
- else
- load = target_load(i, load_idx);
-
- avg_load += load;
- }
-
- /* Adjust by relative CPU power of the group */
- avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
-
- if (local_group) {
- this_load = avg_load;
- this = group;
- } else if (avg_load < min_load) {
- min_load = avg_load;
- idlest = group;
- }
- } while (group = group->next, group != sd->groups);
-
- if (!idlest || 100*this_load < imbalance*min_load)
- return NULL;
- return idlest;
-}
-
-/*
- * find_idlest_cpu - find the idlest cpu among the cpus in group.
- */
-static int
-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
-{
- unsigned long load, min_load = ULONG_MAX;
- int idlest = -1;
- int i;
-
- /* Traverse only the allowed CPUs */
- for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
- load = weighted_cpuload(i);
-
- if (load < min_load || (load == min_load && i == this_cpu)) {
- min_load = load;
- idlest = i;
- }
- }
-
- return idlest;
-}