aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2014-03-31 11:21:19 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2014-03-31 11:21:19 -0700
commit971eae7c99212dd67b425a603f1fe3b763359907 (patch)
tree2ff002ecc759275cbecee123a230f90ea7452b18
parent8c292f11744297dfb3a69f4a0bccbe4a6417b50d (diff)
parent6037dd1a49f95092824fa8ba75c717ff7805e317 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar: "Bigger changes: - sched/idle restructuring: they are WIP preparation for deeper integration between the scheduler and idle state selection, by Nicolas Pitre. - add NUMA scheduling pseudo-interleaving, by Rik van Riel. - optimize cgroup context switches, by Peter Zijlstra. - RT scheduling enhancements, by Thomas Gleixner. The rest is smaller changes, non-urgnt fixes and cleanups" * 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (68 commits) sched: Clean up the task_hot() function sched: Remove double calculation in fix_small_imbalance() sched: Fix broken setscheduler() sparc64, sched: Remove unused sparc64_multi_core sched: Remove unused mc_capable() and smt_capable() sched/numa: Move task_numa_free() to __put_task_struct() sched/fair: Fix endless loop in idle_balance() sched/core: Fix endless loop in pick_next_task() sched/fair: Push down check for high priority class task into idle_balance() sched/rt: Fix picking RT and DL tasks from empty queue trace: Replace hardcoding of 19 with MAX_NICE sched: Guarantee task priority in pick_next_task() sched/idle: Remove stale old file sched: Put rq's sched_avg under CONFIG_FAIR_GROUP_SCHED cpuidle/arm64: Remove redundant cpuidle_idle_call() cpuidle/powernv: Remove redundant cpuidle_idle_call() sched, nohz: Exclude isolated cores from load balancing sched: Fix select_task_rq_fair() description comments workqueue: Replace hardcoding of -20 and 19 with MIN_NICE and MAX_NICE sys: Replace hardcoding of -20 and 19 with MIN_NICE and MAX_NICE ...
-rw-r--r--Documentation/sysctl/kernel.txt10
-rw-r--r--arch/arm/include/asm/topology.h3
-rw-r--r--arch/arm/kernel/process.c16
-rw-r--r--arch/arm64/kernel/process.c7
-rw-r--r--arch/ia64/include/asm/topology.h1
-rw-r--r--arch/mips/include/asm/topology.h4
-rw-r--r--arch/powerpc/include/asm/topology.h1
-rw-r--r--arch/powerpc/platforms/cell/spufs/sched.c1
-rw-r--r--arch/powerpc/platforms/powernv/setup.c13
-rw-r--r--arch/powerpc/platforms/pseries/setup.c34
-rw-r--r--arch/sh/kernel/idle.c4
-rw-r--r--arch/sparc/include/asm/smp_64.h1
-rw-r--r--arch/sparc/include/asm/topology_64.h2
-rw-r--r--arch/sparc/kernel/mdesc.c4
-rw-r--r--arch/sparc/kernel/prom_64.c3
-rw-r--r--arch/sparc/kernel/smp_64.c2
-rw-r--r--arch/x86/include/asm/topology.h6
-rw-r--r--arch/x86/kernel/process.c5
-rw-r--r--drivers/cpuidle/cpuidle-powernv.c5
-rw-r--r--drivers/cpuidle/cpuidle-pseries.c6
-rw-r--r--include/linux/sched.h44
-rw-r--r--include/linux/sched/prio.h44
-rw-r--r--include/linux/sched/rt.h26
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/cpu/Makefile1
-rw-r--r--kernel/fork.c1
-rw-r--r--kernel/locking/rtmutex.c12
-rw-r--r--kernel/rcu/rcutorture.c8
-rw-r--r--kernel/sched/Makefile2
-rw-r--r--kernel/sched/auto_group.c2
-rw-r--r--kernel/sched/core.c207
-rw-r--r--kernel/sched/cputime.c4
-rw-r--r--kernel/sched/deadline.c56
-rw-r--r--kernel/sched/debug.c7
-rw-r--r--kernel/sched/fair.c600
-rw-r--r--kernel/sched/idle.c (renamed from kernel/cpu/idle.c)7
-rw-r--r--kernel/sched/idle_task.c25
-rw-r--r--kernel/sched/rt.c102
-rw-r--r--kernel/sched/sched.h65
-rw-r--r--kernel/sched/stop_task.c15
-rw-r--r--kernel/sys.c8
-rw-r--r--kernel/sysctl.c7
-rw-r--r--kernel/trace/ring_buffer_benchmark.c6
-rw-r--r--kernel/workqueue.c2
-rw-r--r--mm/mempolicy.c74
45 files changed, 913 insertions, 541 deletions
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index e1d28fbf757..ec8be46bf48 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -442,8 +442,7 @@ feature should be disabled. Otherwise, if the system overhead from the
feature is too high then the rate the kernel samples for NUMA hinting
faults may be controlled by the numa_balancing_scan_period_min_ms,
numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms,
-numa_balancing_scan_size_mb, numa_balancing_settle_count sysctls and
-numa_balancing_migrate_deferred.
+numa_balancing_scan_size_mb, and numa_balancing_settle_count sysctls.
==============================================================
@@ -484,13 +483,6 @@ rate for each task.
numa_balancing_scan_size_mb is how many megabytes worth of pages are
scanned for a given scan.
-numa_balancing_migrate_deferred is how many page migrations get skipped
-unconditionally, after a page migration is skipped because a page is shared
-with other tasks. This reduces page migration overhead, and determines
-how much stronger the "move task near its memory" policy scheduler becomes,
-versus the "move memory near its task" memory management policy, for workloads
-with shared memory.
-
==============================================================
osrelease, ostype & version:
diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index 58b8b84adcd..2fe85fff5cc 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -20,9 +20,6 @@ extern struct cputopo_arm cpu_topology[NR_CPUS];
#define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_sibling)
#define topology_thread_cpumask(cpu) (&cpu_topology[cpu].thread_sibling)
-#define mc_capable() (cpu_topology[0].socket_id != -1)
-#define smt_capable() (cpu_topology[0].thread_id != -1)
-
void init_cpu_topology(void);
void store_cpu_topology(unsigned int cpuid);
const struct cpumask *cpu_coregroup_mask(int cpu);
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 92f7b15dd22..adabeababeb 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -30,7 +30,6 @@
#include <linux/uaccess.h>
#include <linux/random.h>
#include <linux/hw_breakpoint.h>
-#include <linux/cpuidle.h>
#include <linux/leds.h>
#include <linux/reboot.h>
@@ -133,7 +132,11 @@ EXPORT_SYMBOL_GPL(arm_pm_restart);
void (*arm_pm_idle)(void);
-static void default_idle(void)
+/*
+ * Called from the core idle loop.
+ */
+
+void arch_cpu_idle(void)
{
if (arm_pm_idle)
arm_pm_idle();
@@ -168,15 +171,6 @@ void arch_cpu_idle_dead(void)
#endif
/*
- * Called from the core idle loop.
- */
-void arch_cpu_idle(void)
-{
- if (cpuidle_idle_call())
- default_idle();
-}
-
-/*
* Called by kexec, immediately prior to machine_kexec().
*
* This must completely disable all secondary CPUs; simply causing those CPUs
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 1c0a9be2ffa..9cce0098f4c 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -33,7 +33,6 @@
#include <linux/kallsyms.h>
#include <linux/init.h>
#include <linux/cpu.h>
-#include <linux/cpuidle.h>
#include <linux/elfcore.h>
#include <linux/pm.h>
#include <linux/tick.h>
@@ -94,10 +93,8 @@ void arch_cpu_idle(void)
* This should do all the clock switching and wait for interrupt
* tricks
*/
- if (cpuidle_idle_call()) {
- cpu_do_idle();
- local_irq_enable();
- }
+ cpu_do_idle();
+ local_irq_enable();
}
#ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index a2496e449b7..5cb55a1e606 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -77,7 +77,6 @@ void build_cpu_to_node_map(void);
#define topology_core_id(cpu) (cpu_data(cpu)->core_id)
#define topology_core_cpumask(cpu) (&cpu_core_map[cpu])
#define topology_thread_cpumask(cpu) (&per_cpu(cpu_sibling_map, cpu))
-#define smt_capable() (smp_num_siblings > 1)
#endif
extern void arch_fix_phys_package_id(int num, u32 slot);
diff --git a/arch/mips/include/asm/topology.h b/arch/mips/include/asm/topology.h
index 12609a17dc8..20ea4859c82 100644
--- a/arch/mips/include/asm/topology.h
+++ b/arch/mips/include/asm/topology.h
@@ -10,8 +10,4 @@
#include <topology.h>
-#ifdef CONFIG_SMP
-#define smt_capable() (smp_num_siblings > 1)
-#endif
-
#endif /* __ASM_TOPOLOGY_H */
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index d0b5fca6b07..c9202151079 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -99,7 +99,6 @@ static inline int prrn_is_enabled(void)
#ifdef CONFIG_SMP
#include <asm/cputable.h>
-#define smt_capable() (cpu_has_feature(CPU_FTR_SMT))
#ifdef CONFIG_PPC64
#include <asm/smp.h>
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 49318385d4f..4a0a64fe25d 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -83,7 +83,6 @@ static struct timer_list spuloadavg_timer;
#define MIN_SPU_TIMESLICE max(5 * HZ / (1000 * SPUSCHED_TICK), 1)
#define DEF_SPU_TIMESLICE (100 * HZ / (1000 * SPUSCHED_TICK))
-#define MAX_USER_PRIO (MAX_PRIO - MAX_RT_PRIO)
#define SCALE_PRIO(x, prio) \
max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_SPU_TIMESLICE)
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index 110f4fbd319..81a7a0a79be 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -26,7 +26,6 @@
#include <linux/of_fdt.h>
#include <linux/interrupt.h>
#include <linux/bug.h>
-#include <linux/cpuidle.h>
#include <linux/pci.h>
#include <asm/machdep.h>
@@ -225,16 +224,6 @@ static int __init pnv_probe(void)
return 1;
}
-void powernv_idle(void)
-{
- /* Hook to cpuidle framework if available, else
- * call on default platform idle code
- */
- if (cpuidle_idle_call()) {
- power7_idle();
- }
-}
-
define_machine(powernv) {
.name = "PowerNV",
.probe = pnv_probe,
@@ -244,7 +233,7 @@ define_machine(powernv) {
.show_cpuinfo = pnv_show_cpuinfo,
.progress = pnv_progress,
.machine_shutdown = pnv_shutdown,
- .power_save = powernv_idle,
+ .power_save = power7_idle,
.calibrate_decr = generic_calibrate_decr,
.dma_set_mask = pnv_dma_set_mask,
#ifdef CONFIG_KEXEC
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 972df0ffd4d..2db8cc691bf 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -39,7 +39,6 @@
#include <linux/irq.h>
#include <linux/seq_file.h>
#include <linux/root_dev.h>
-#include <linux/cpuidle.h>
#include <linux/of.h>
#include <linux/kexec.h>
@@ -356,29 +355,24 @@ early_initcall(alloc_dispatch_log_kmem_cache);
static void pseries_lpar_idle(void)
{
- /* This would call on the cpuidle framework, and the back-end pseries
- * driver to go to idle states
+ /*
+ * Default handler to go into low thread priority and possibly
+ * low power mode by cedeing processor to hypervisor
*/
- if (cpuidle_idle_call()) {
- /* On error, execute default handler
- * to go into low thread priority and possibly
- * low power mode by cedeing processor to hypervisor
- */
- /* Indicate to hypervisor that we are idle. */
- get_lppaca()->idle = 1;
+ /* Indicate to hypervisor that we are idle. */
+ get_lppaca()->idle = 1;
- /*
- * Yield the processor to the hypervisor. We return if
- * an external interrupt occurs (which are driven prior
- * to returning here) or if a prod occurs from another
- * processor. When returning here, external interrupts
- * are enabled.
- */
- cede_processor();
+ /*
+ * Yield the processor to the hypervisor. We return if
+ * an external interrupt occurs (which are driven prior
+ * to returning here) or if a prod occurs from another
+ * processor. When returning here, external interrupts
+ * are enabled.
+ */
+ cede_processor();
- get_lppaca()->idle = 0;
- }
+ get_lppaca()->idle = 0;
}
/*
diff --git a/arch/sh/kernel/idle.c b/arch/sh/kernel/idle.c
index 2ea4483fd72..be616ee0cf8 100644
--- a/arch/sh/kernel/idle.c
+++ b/arch/sh/kernel/idle.c
@@ -16,7 +16,6 @@
#include <linux/thread_info.h>
#include <linux/irqflags.h>
#include <linux/smp.h>
-#include <linux/cpuidle.h>
#include <linux/atomic.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
@@ -40,8 +39,7 @@ void arch_cpu_idle_dead(void)
void arch_cpu_idle(void)
{
- if (cpuidle_idle_call())
- sh_idle();
+ sh_idle();
}
void __init select_idle_routine(void)
diff --git a/arch/sparc/include/asm/smp_64.h b/arch/sparc/include/asm/smp_64.h
index dd3bef4b989..05710393959 100644
--- a/arch/sparc/include/asm/smp_64.h
+++ b/arch/sparc/include/asm/smp_64.h
@@ -32,7 +32,6 @@
DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
extern cpumask_t cpu_core_map[NR_CPUS];
-extern int sparc64_multi_core;
extern void arch_send_call_function_single_ipi(int cpu);
extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h
index 1754390a426..a2d10fc64fa 100644
--- a/arch/sparc/include/asm/topology_64.h
+++ b/arch/sparc/include/asm/topology_64.h
@@ -42,8 +42,6 @@ static inline int pcibus_to_node(struct pci_bus *pbus)
#define topology_core_id(cpu) (cpu_data(cpu).core_id)
#define topology_core_cpumask(cpu) (&cpu_core_map[cpu])
#define topology_thread_cpumask(cpu) (&per_cpu(cpu_sibling_map, cpu))
-#define mc_capable() (sparc64_multi_core)
-#define smt_capable() (sparc64_multi_core)
#endif /* CONFIG_SMP */
extern cpumask_t cpu_core_map[NR_CPUS];
diff --git a/arch/sparc/kernel/mdesc.c b/arch/sparc/kernel/mdesc.c
index b90bf23e3aa..a1a4400d402 100644
--- a/arch/sparc/kernel/mdesc.c
+++ b/arch/sparc/kernel/mdesc.c
@@ -896,10 +896,6 @@ void mdesc_fill_in_cpu_data(cpumask_t *mask)
mdesc_iterate_over_cpus(fill_in_one_cpu, NULL, mask);
-#ifdef CONFIG_SMP
- sparc64_multi_core = 1;
-#endif
-
hp = mdesc_grab();
set_core_ids(hp);
diff --git a/arch/sparc/kernel/prom_64.c b/arch/sparc/kernel/prom_64.c
index 6b39125eb92..9a690d39c01 100644
--- a/arch/sparc/kernel/prom_64.c
+++ b/arch/sparc/kernel/prom_64.c
@@ -555,9 +555,6 @@ static void *fill_in_one_cpu(struct device_node *dp, int cpuid, int arg)
cpu_data(cpuid).core_id = portid + 1;
cpu_data(cpuid).proc_id = portid;
-#ifdef CONFIG_SMP
- sparc64_multi_core = 1;
-#endif
} else {
cpu_data(cpuid).dcache_size =
of_getintprop_default(dp, "dcache-size", 16 * 1024);
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index b085311dcd0..9781048161a 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -53,8 +53,6 @@
#include "cpumap.h"
-int sparc64_multi_core __read_mostly;
-
DEFINE_PER_CPU(cpumask_t, cpu_sibling_map) = CPU_MASK_NONE;
cpumask_t cpu_core_map[NR_CPUS] __read_mostly =
{ [0 ... NR_CPUS-1] = CPU_MASK_NONE };
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 1306d117967..b28097e4c8c 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -134,12 +134,6 @@ static inline void arch_fix_phys_package_id(int num, u32 slot)
struct pci_bus;
void x86_pci_root_bus_resources(int bus, struct list_head *resources);
-#ifdef CONFIG_SMP
-#define mc_capable() ((boot_cpu_data.x86_max_cores > 1) && \
- (cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids))
-#define smt_capable() (smp_num_siblings > 1)
-#endif
-
#ifdef CONFIG_NUMA
extern int get_mp_bus_to_node(int busnum);
extern void set_mp_bus_to_node(int busnum, int node);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 3fb8d95ab8b..4505e2a950d 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -298,10 +298,7 @@ void arch_cpu_idle_dead(void)
*/
void arch_cpu_idle(void)
{
- if (cpuidle_idle_call())
- x86_idle();
- else
- local_irq_enable();
+ x86_idle();
}
/*
diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c
index 78fd174c57e..f48607cd254 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -14,6 +14,7 @@
#include <asm/machdep.h>
#include <asm/firmware.h>
+#include <asm/runlatch.h>
struct cpuidle_driver powernv_idle_driver = {
.name = "powernv_idle",
@@ -30,12 +31,14 @@ static int snooze_loop(struct cpuidle_device *dev,
local_irq_enable();
set_thread_flag(TIF_POLLING_NRFLAG);
+ ppc64_runlatch_off();
while (!need_resched()) {
HMT_low();
HMT_very_low();
}
HMT_medium();
+ ppc64_runlatch_on();
clear_thread_flag(TIF_POLLING_NRFLAG);
smp_mb();
return index;
@@ -45,7 +48,9 @@ static int nap_loop(struct cpuidle_device *dev,
struct cpuidle_driver *drv,
int index)
{
+ ppc64_runlatch_off();
power7_idle();
+ ppc64_runlatch_on();
return index;
}
diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c
index 7ab564aa0b1..6f7b0195688 100644
--- a/drivers/cpuidle/cpuidle-pseries.c
+++ b/drivers/cpuidle/cpuidle-pseries.c
@@ -17,6 +17,7 @@
#include <asm/reg.h>
#include <asm/machdep.h>
#include <asm/firmware.h>
+#include <asm/runlatch.h>
#include <asm/plpar_wrappers.h>
struct cpuidle_driver pseries_idle_driver = {
@@ -29,6 +30,7 @@ static struct cpuidle_state *cpuidle_state_table;
static inline void idle_loop_prolog(unsigned long *in_purr)
{
+ ppc64_runlatch_off();
*in_purr = mfspr(SPRN_PURR);
/*
* Indicate to the HV that we are idle. Now would be
@@ -45,6 +47,10 @@ static inline void idle_loop_epilog(unsigned long in_purr)
wait_cycles += mfspr(SPRN_PURR) - in_purr;
get_lppaca()->wait_state_cycles = cpu_to_be64(wait_cycles);
get_lppaca()->idle = 0;
+
+ if (irqs_disabled())
+ local_irq_enable();
+ ppc64_runlatch_on();
}
static int snooze_loop(struct cpuidle_device *dev,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a781dec1cd0..825ed838d4b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3,6 +3,8 @@
#include <uapi/linux/sched.h>
+#include <linux/sched/prio.h>
+
struct sched_param {
int sched_priority;
@@ -1077,6 +1079,7 @@ struct sched_entity {
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
+ int depth;
struct sched_entity *parent;
/* rq on which this entity is (to be) queued: */
struct cfs_rq *cfs_rq;
@@ -1460,6 +1463,9 @@ struct task_struct {
struct mutex perf_event_mutex;
struct list_head perf_event_list;
#endif
+#ifdef CONFIG_DEBUG_PREEMPT
+ unsigned long preempt_disable_ip;
+#endif
#ifdef CONFIG_NUMA
struct mempolicy *mempolicy; /* Protected by alloc_lock */
short il_next;
@@ -1470,9 +1476,10 @@ struct task_struct {
unsigned int numa_scan_period;
unsigned int numa_scan_period_max;
int numa_preferred_nid;
- int numa_migrate_deferred;
unsigned long numa_migrate_retry;
u64 node_stamp; /* migration stamp */
+ u64 last_task_numa_placement;
+ u64 last_sum_exec_runtime;
struct callback_head numa_work;
struct list_head numa_entry;
@@ -1483,15 +1490,22 @@ struct task_struct {
* Scheduling placement decisions are made based on the these counts.
* The values remain static for the duration of a PTE scan
*/
- unsigned long *numa_faults;
+ unsigned long *numa_faults_memory;
unsigned long total_numa_faults;
/*
* numa_faults_buffer records faults per node during the current
- * scan window. When the scan completes, the counts in numa_faults
- * decay and these values are copied.
+ * scan window. When the scan completes, the counts in
+ * numa_faults_memory decay and these values are copied.
+ */
+ unsigned long *numa_faults_buffer_memory;
+
+ /*
+ * Track the nodes the process was running on when a NUMA hinting
+ * fault was incurred.
*/
- unsigned long *numa_faults_buffer;
+ unsigned long *numa_faults_cpu;
+ unsigned long *numa_faults_buffer_cpu;
/*
* numa_faults_locality tracks if faults recorded during the last
@@ -1596,8 +1610,8 @@ extern void task_numa_fault(int last_node, int node, int pages, int flags);
extern pid_t task_numa_group_id(struct task_struct *p);
extern void set_numabalancing_state(bool enabled);
extern void task_numa_free(struct task_struct *p);
-
-extern unsigned int sysctl_numa_balancing_migrate_deferred;
+extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
+ int src_nid, int dst_cpu);
#else
static inline void task_numa_fault(int last_node, int node, int pages,
int flags)
@@ -1613,6 +1627,11 @@ static inline void set_numabalancing_state(bool enabled)
static inline void task_numa_free(struct task_struct *p)
{
}
+static inline bool should_numa_migrate_memory(struct task_struct *p,
+ struct page *page, int src_nid, int dst_cpu)
+{
+ return true;
+}
#endif
static inline struct pid *task_pid(struct task_struct *task)
@@ -2080,7 +2099,16 @@ static inline void sched_autogroup_exit(struct signal_struct *sig) { }
extern bool yield_to(struct task_struct *p, bool preempt);
extern void set_user_nice(struct task_struct *p, long nice);
extern int task_prio(const struct task_struct *p);
-extern int task_nice(const struct task_struct *p);
+/**
+ * task_nice - return the nice value of a given task.
+ * @p: the task in question.
+ *
+ * Return: The nice value [ -20 ... 0 ... 19 ].
+ */
+static inline int task_nice(const struct task_struct *p)
+{
+ return PRIO_TO_NICE((p)->static_prio);
+}
extern int can_nice(const struct task_struct *p, const int nice);
extern int task_curr(const struct task_struct *p);
extern int idle_cpu(int cpu);
diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h
new file mode 100644
index 00000000000..ac322583c82
--- /dev/null
+++ b/include/linux/sched/prio.h
@@ -0,0 +1,44 @@
+#ifndef _SCHED_PRIO_H
+#define _SCHED_PRIO_H
+
+#define MAX_NICE 19
+#define MIN_NICE -20
+#define NICE_WIDTH (MAX_NICE - MIN_NICE + 1)
+
+/*
+ * Priority of a process goes from 0..MAX_PRIO-1, valid RT
+ * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
+ * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
+ * values are inverted: lower p->prio value means higher priority.
+ *
+ * The MAX_USER_RT_PRIO value allows the actual maximum
+ * RT priority to be separate from the value exported to
+ * user-space. This allows kernel threads to set their
+ * priority to a value higher than any user task. Note:
+ * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
+ */
+
+#define MAX_USER_RT_PRIO 100
+#define MAX_RT_PRIO MAX_USER_RT_PRIO
+
+#define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH)
+#define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2)
+
+/*
+ * Convert user-nice values [ -20 ... 0 ... 19 ]
+ * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
+ * and back.
+ */
+#define NICE_TO_PRIO(nice) ((nice) + DEFAULT_PRIO)
+#define PRIO_TO_NICE(prio) ((prio) - DEFAULT_PRIO)
+
+/*
+ * 'User priority' is the nice value converted to something we
+ * can work with better when scaling various scheduler parameters,
+ * it's a [ 0 ... 39 ] range.
+ */
+#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
+#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
+#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
+
+#endif /* _SCHED_PRIO_H */
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index 34e4ebea8fc..6341f5be6e2 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -1,24 +1,7 @@
#ifndef _SCHED_RT_H
#define _SCHED_RT_H
-/*
- * Priority of a process goes from 0..MAX_PRIO-1, valid RT
- * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
- * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
- * values are inverted: lower p->prio value means higher priority.
- *
- * The MAX_USER_RT_PRIO value allows the actual maximum
- * RT priority to be separate from the value exported to
- * user-space. This allows kernel threads to set their
- * priority to a value higher than any user task. Note:
- * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
- */
-
-#define MAX_USER_RT_PRIO 100
-#define MAX_RT_PRIO MAX_USER_RT_PRIO
-
-#define MAX_PRIO (MAX_RT_PRIO + 40)
-#define DEFAULT_PRIO (MAX_RT_PRIO + 20)
+#include <linux/sched/prio.h>
static inline int rt_prio(int prio)
{
@@ -35,6 +18,7 @@ static inline int rt_task(struct task_struct *p)
#ifdef CONFIG_RT_MUTEXES
extern int rt_mutex_getprio(struct task_struct *p);
extern void rt_mutex_setprio(struct task_struct *p, int prio);
+extern int rt_mutex_check_prio(struct task_struct *task, int newprio);
extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task);
extern void rt_mutex_adjust_pi(struct task_struct *p);
static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
@@ -46,6 +30,12 @@ static inline int rt_mutex_getprio(struct task_struct *p)
{
return p->normal_prio;
}
+
+static inline int rt_mutex_check_prio(struct task_struct *task, int newprio)
+{
+ return 0;
+}
+
static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
{
return NULL;
diff --git a/kernel/Makefile b/kernel/Makefile
index 5c0e7666811..4fd847488b7 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -22,7 +22,6 @@ obj-y += sched/
obj-y += locking/
obj-y += power/
obj-y += printk/
-obj-y += cpu/
obj-y += irq/
obj-y += rcu/
<