diff options
117 files changed, 3569 insertions, 1594 deletions
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 9d4c1d18ad4..4273b2d71a2 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -355,6 +355,82 @@ utilize. ============================================================== +numa_balancing + +Enables/disables automatic page fault based NUMA memory +balancing. Memory is moved automatically to nodes +that access it often. + +Enables/disables automatic NUMA memory balancing. On NUMA machines, there +is a performance penalty if remote memory is accessed by a CPU. When this +feature is enabled the kernel samples what task thread is accessing memory +by periodically unmapping pages and later trapping a page fault. At the +time of the page fault, it is determined if the data being accessed should +be migrated to a local memory node. + +The unmapping of pages and trapping faults incur additional overhead that +ideally is offset by improved memory locality but there is no universal +guarantee. If the target workload is already bound to NUMA nodes then this +feature should be disabled. Otherwise, if the system overhead from the +feature is too high then the rate the kernel samples for NUMA hinting +faults may be controlled by the numa_balancing_scan_period_min_ms, +numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms, +numa_balancing_scan_size_mb, numa_balancing_settle_count sysctls and +numa_balancing_migrate_deferred. + +============================================================== + +numa_balancing_scan_period_min_ms, numa_balancing_scan_delay_ms, +numa_balancing_scan_period_max_ms, numa_balancing_scan_size_mb + +Automatic NUMA balancing scans tasks address space and unmaps pages to +detect if pages are properly placed or if the data should be migrated to a +memory node local to where the task is running. Every "scan delay" the task +scans the next "scan size" number of pages in its address space. When the +end of the address space is reached the scanner restarts from the beginning. + +In combination, the "scan delay" and "scan size" determine the scan rate. +When "scan delay" decreases, the scan rate increases. The scan delay and +hence the scan rate of every task is adaptive and depends on historical +behaviour. If pages are properly placed then the scan delay increases, +otherwise the scan delay decreases. The "scan size" is not adaptive but +the higher the "scan size", the higher the scan rate. + +Higher scan rates incur higher system overhead as page faults must be +trapped and potentially data must be migrated. However, the higher the scan +rate, the more quickly a tasks memory is migrated to a local node if the +workload pattern changes and minimises performance impact due to remote +memory accesses. These sysctls control the thresholds for scan delays and +the number of pages scanned. + +numa_balancing_scan_period_min_ms is the minimum time in milliseconds to +scan a tasks virtual memory. It effectively controls the maximum scanning +rate for each task. + +numa_balancing_scan_delay_ms is the starting "scan delay" used for a task +when it initially forks. + +numa_balancing_scan_period_max_ms is the maximum time in milliseconds to +scan a tasks virtual memory. It effectively controls the minimum scanning +rate for each task. + +numa_balancing_scan_size_mb is how many megabytes worth of pages are +scanned for a given scan. + +numa_balancing_settle_count is how many scan periods must complete before +the schedule balancer stops pushing the task towards a preferred node. This +gives the scheduler a chance to place the task on an alternative node if the +preferred node is overloaded. + +numa_balancing_migrate_deferred is how many page migrations get skipped +unconditionally, after a page migration is skipped because a page is shared +with other tasks. This reduces page migration overhead, and determines +how much stronger the "move task near its memory" policy scheduler becomes, +versus the "move memory near its task" memory management policy, for workloads +with shared memory. + +============================================================== + osrelease, ostype & version: # cat osrelease diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index ea2d35d64d2..bd365988e8d 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -655,7 +655,11 @@ explains which is which. read the irq flags variable, an 'X' will always be printed here. - need-resched: 'N' task need_resched is set, '.' otherwise. + need-resched: + 'N' both TIF_NEED_RESCHED and PREEMPT_NEED_RESCHED is set, + 'n' only TIF_NEED_RESCHED is set, + 'p' only PREEMPT_NEED_RESCHED is set, + '.' otherwise. hardirq/softirq: 'H' - hard irq occurred inside a softirq. diff --git a/MAINTAINERS b/MAINTAINERS index 097db636771..5c746dd83f3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7326,6 +7326,8 @@ S: Maintained F: kernel/sched/ F: include/linux/sched.h F: include/uapi/linux/sched.h +F: kernel/wait.c +F: include/linux/wait.h SCORE ARCHITECTURE M: Chen Liqin <liqin.linux@gmail.com> diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild index a6e85f448c1..f01fb505ad5 100644 --- a/arch/alpha/include/asm/Kbuild +++ b/arch/alpha/include/asm/Kbuild @@ -3,3 +3,4 @@ generic-y += clkdev.h generic-y += exec.h generic-y += trace_clock.h +generic-y += preempt.h diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild index d8dd660898b..5943f7f9d32 100644 --- a/arch/arc/include/asm/Kbuild +++ b/arch/arc/include/asm/Kbuild @@ -46,3 +46,4 @@ generic-y += ucontext.h generic-y += user.h generic-y += vga.h generic-y += xor.h +generic-y += preempt.h diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild index 59ceae8f3c9..1a7024b4135 100644 --- a/arch/arm/include/asm/Kbuild +++ b/arch/arm/include/asm/Kbuild @@ -32,3 +32,4 @@ generic-y += termios.h generic-y += timex.h generic-y += trace_clock.h generic-y += unaligned.h +generic-y += preempt.h diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index 79a642d199f..519f89f5b6a 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild @@ -50,3 +50,4 @@ generic-y += unaligned.h generic-y += user.h generic-y += vga.h generic-y += xor.h +generic-y += preempt.h diff --git a/arch/avr32/include/asm/Kbuild b/arch/avr32/include/asm/Kbuild index fd798074389..658001b5240 100644 --- a/arch/avr32/include/asm/Kbuild +++ b/arch/avr32/include/asm/Kbuild @@ -7,6 +7,7 @@ generic-y += div64.h generic-y += emergency-restart.h generic-y += exec.h generic-y += futex.h +generic-y += preempt.h generic-y += irq_regs.h generic-y += param.h generic-y += local.h diff --git a/arch/blackfin/include/asm/Kbuild b/arch/blackfin/include/asm/Kbuild index 127826f8a37..f2b43474b0e 100644 --- a/arch/blackfin/include/asm/Kbuild +++ b/arch/blackfin/include/asm/Kbuild @@ -44,3 +44,4 @@ generic-y += ucontext.h generic-y += unaligned.h generic-y += user.h generic-y += xor.h +generic-y += preempt.h diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild index e49f918531a..fc0b3c35602 100644 --- a/arch/c6x/include/asm/Kbuild +++ b/arch/c6x/include/asm/Kbuild @@ -56,3 +56,4 @@ generic-y += ucontext.h generic-y += user.h generic-y += vga.h generic-y += xor.h +generic-y += preempt.h diff --git a/arch/cris/include/asm/Kbuild b/arch/cris/include/asm/Kbuild index c8325455520..b06caf649a9 100644 --- a/arch/cris/include/asm/Kbuild +++ b/arch/cris/include/asm/Kbuild @@ -11,3 +11,4 @@ generic-y += module.h generic-y += trace_clock.h generic-y += vga.h generic-y += xor.h +generic-y += preempt.h diff --git a/arch/frv/include/asm/Kbuild b/arch/frv/include/asm/Kbuild index c5d76702830..74742dc6a3d 100644 --- a/arch/frv/include/asm/Kbuild +++ b/arch/frv/include/asm/Kbuild @@ - |