Diffstat (limited to 'arch/x86/mm')
-rw-r--r--  arch/x86/mm/Makefile               |   2
-rw-r--r--  arch/x86/mm/dump_pagetables.c      | 118
-rw-r--r--  arch/x86/mm/fault.c                | 156
-rw-r--r--  arch/x86/mm/gup.c                  |  21
-rw-r--r--  arch/x86/mm/hugetlbpage.c          |  19
-rw-r--r--  arch/x86/mm/init.c                 | 148
-rw-r--r--  arch/x86/mm/init_32.c              |   5
-rw-r--r--  arch/x86/mm/init_64.c              |  61
-rw-r--r--  arch/x86/mm/ioremap.c              | 254
-rw-r--r--  arch/x86/mm/kmemcheck/kmemcheck.c  |   8
-rw-r--r--  arch/x86/mm/kmmio.c                |   1
-rw-r--r--  arch/x86/mm/memtest.c              |   2
-rw-r--r--  arch/x86/mm/numa.c                 |  80
-rw-r--r--  arch/x86/mm/numa_32.c              |   2
-rw-r--r--  arch/x86/mm/pageattr-test.c        |   3
-rw-r--r--  arch/x86/mm/pageattr.c             | 493
-rw-r--r--  arch/x86/mm/pgtable.c              |  50
-rw-r--r--  arch/x86/mm/pgtable_32.c           |   2
-rw-r--r--  arch/x86/mm/srat.c                 |  29
-rw-r--r--  arch/x86/mm/tlb.c                  |  52
20 files changed, 1045 insertions, 461 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 23d8e5fecf7..6a19ad9f370 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -6,6 +6,8 @@ nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_physaddr.o := $(nostackp)
CFLAGS_setup_nx.o := $(nostackp)
+CFLAGS_fault.o := -I$(src)/../include/asm/trace
+
obj-$(CONFIG_X86_PAT) += pat_rbtree.o
obj-$(CONFIG_SMP) += tlb.o
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 0002a3a3308..167ffcac16e 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -30,11 +30,14 @@ struct pg_state {
unsigned long start_address;
unsigned long current_address;
const struct addr_marker *marker;
+ unsigned long lines;
+ bool to_dmesg;
};
struct addr_marker {
unsigned long start_address;
const char *name;
+ unsigned long max_lines;
};
/* indices for address_markers; keep sync'd w/ address_markers below */
@@ -45,6 +48,7 @@ enum address_markers_idx {
LOW_KERNEL_NR,
VMALLOC_START_NR,
VMEMMAP_START_NR,
+ ESPFIX_START_NR,
HIGH_KERNEL_NR,
MODULES_VADDR_NR,
MODULES_END_NR,
@@ -67,6 +71,7 @@ static struct addr_marker address_markers[] = {
{ PAGE_OFFSET, "Low Kernel Mapping" },
{ VMALLOC_START, "vmalloc() Area" },
{ VMEMMAP_START, "Vmemmap" },
+ { ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
{ __START_KERNEL_map, "High Kernel Mapping" },
{ MODULES_VADDR, "Modules" },
{ MODULES_END, "End Modules" },
@@ -88,10 +93,28 @@ static struct addr_marker address_markers[] = {
#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
+#define pt_dump_seq_printf(m, to_dmesg, fmt, args...) \
+({ \
+ if (to_dmesg) \
+ printk(KERN_INFO fmt, ##args); \
+ else \
+ if (m) \
+ seq_printf(m, fmt, ##args); \
+})
+
+#define pt_dump_cont_printf(m, to_dmesg, fmt, args...) \
+({ \
+ if (to_dmesg) \
+ printk(KERN_CONT fmt, ##args); \
+ else \
+ if (m) \
+ seq_printf(m, fmt, ##args); \
+})
+
/*
* Print a readable form of a pgprot_t to the seq_file
*/
-static void printk_prot(struct seq_file *m, pgprot_t prot, int level)
+static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
{
pgprotval_t pr = pgprot_val(prot);
static const char * const level_name[] =
@@ -99,47 +122,47 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level)
if (!pgprot_val(prot)) {
/* Not present */
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, dmsg, " ");
} else {
if (pr & _PAGE_USER)
- seq_printf(m, "USR ");
+ pt_dump_cont_printf(m, dmsg, "USR ");
else
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, dmsg, " ");
if (pr & _PAGE_RW)
- seq_printf(m, "RW ");
+ pt_dump_cont_printf(m, dmsg, "RW ");
else
- seq_printf(m, "ro ");
+ pt_dump_cont_printf(m, dmsg, "ro ");
if (pr & _PAGE_PWT)
- seq_printf(m, "PWT ");
+ pt_dump_cont_printf(m, dmsg, "PWT ");
else
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, dmsg, " ");
if (pr & _PAGE_PCD)
- seq_printf(m, "PCD ");
+ pt_dump_cont_printf(m, dmsg, "PCD ");
else
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, dmsg, " ");
/* Bit 9 has a different meaning on level 3 vs 4 */
if (level <= 3) {
if (pr & _PAGE_PSE)
- seq_printf(m, "PSE ");
+ pt_dump_cont_printf(m, dmsg, "PSE ");
else
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, dmsg, " ");
} else {
if (pr & _PAGE_PAT)
- seq_printf(m, "pat ");
+ pt_dump_cont_printf(m, dmsg, "pat ");
else
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, dmsg, " ");
}
if (pr & _PAGE_GLOBAL)
- seq_printf(m, "GLB ");
+ pt_dump_cont_printf(m, dmsg, "GLB ");
else
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, dmsg, " ");
if (pr & _PAGE_NX)
- seq_printf(m, "NX ");
+ pt_dump_cont_printf(m, dmsg, "NX ");
else
- seq_printf(m, "x ");
+ pt_dump_cont_printf(m, dmsg, "x ");
}
- seq_printf(m, "%s\n", level_name[level]);
+ pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]);
}
/*
@@ -163,7 +186,7 @@ static void note_page(struct seq_file *m, struct pg_state *st,
pgprot_t new_prot, int level)
{
pgprotval_t prot, cur;
- static const char units[] = "KMGTPE";
+ static const char units[] = "BKMGTPE";
/*
* If we have a "break" in the series, we need to flush the state that
@@ -178,7 +201,9 @@ static void note_page(struct seq_file *m, struct pg_state *st,
st->current_prot = new_prot;
st->level = level;
st->marker = address_markers;
- seq_printf(m, "---[ %s ]---\n", st->marker->name);
+ st->lines = 0;
+ pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
+ st->marker->name);
} else if (prot != cur || level != st->level ||
st->current_address >= st->marker[1].start_address) {
const char *unit = units;
@@ -188,17 +213,24 @@ static void note_page(struct seq_file *m, struct pg_state *st,
/*
* Now print the actual finished series
*/
- seq_printf(m, "0x%0*lx-0x%0*lx ",
- width, st->start_address,
- width, st->current_address);
-
- delta = (st->current_address - st->start_address) >> 10;
- while (!(delta & 1023) && unit[1]) {
- delta >>= 10;
- unit++;
+ if (!st->marker->max_lines ||
+ st->lines < st->marker->max_lines) {
+ pt_dump_seq_printf(m, st->to_dmesg,
+ "0x%0*lx-0x%0*lx ",
+ width, st->start_address,
+ width, st->current_address);
+
+ delta = st->current_address - st->start_address;
+ while (!(delta & 1023) && unit[1]) {
+ delta >>= 10;
+ unit++;
+ }
+ pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ",
+ delta, *unit);
+ printk_prot(m, st->current_prot, st->level,
+ st->to_dmesg);
}
- seq_printf(m, "%9lu%c ", delta, *unit);
- printk_prot(m, st->current_prot, st->level);
+ st->lines++;
/*
* We print markers for special areas of address space,
@@ -206,8 +238,19 @@ static void note_page(struct seq_file *m, struct pg_state *st,
* This helps in the interpretation.
*/
if (st->current_address >= st->marker[1].start_address) {
+ if (st->marker->max_lines &&
+ st->lines > st->marker->max_lines) {
+ unsigned long nskip =
+ st->lines - st->marker->max_lines;
+ pt_dump_seq_printf(m, st->to_dmesg,
+ "... %lu entr%s skipped ... \n",
+ nskip,
+ nskip == 1 ? "y" : "ies");
+ }
st->marker++;
- seq_printf(m, "---[ %s ]---\n", st->marker->name);
+ st->lines = 0;
+ pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
+ st->marker->name);
}
st->start_address = st->current_address;
@@ -296,7 +339,7 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
#define pgd_none(a) pud_none(__pud(pgd_val(a)))
#endif
-static void walk_pgd_level(struct seq_file *m)
+void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
{
#ifdef CONFIG_X86_64
pgd_t *start = (pgd_t *) &init_level4_pgt;
@@ -304,9 +347,12 @@ static void walk_pgd_level(struct seq_file *m)
pgd_t *start = swapper_pg_dir;
#endif
int i;
- struct pg_state st;
+ struct pg_state st = {};
- memset(&st, 0, sizeof(st));
+ if (pgd) {
+ start = pgd;
+ st.to_dmesg = true;
+ }
for (i = 0; i < PTRS_PER_PGD; i++) {
st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
@@ -331,7 +377,7 @@ static void walk_pgd_level(struct seq_file *m)
static int ptdump_show(struct seq_file *m, void *v)
{
- walk_pgd_level(m);
+ ptdump_walk_pgd_level(m, NULL);
return 0;
}
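
The dump_pagetables.c changes above let one page-table walker emit either to a seq_file (the debugfs path) or straight to the kernel log once ptdump_walk_pgd_level() is handed an explicit pgd, and the new per-marker max_lines caps noisy regions such as the ESPfix area. Below is a minimal user-space sketch of that dual-sink dispatch only; struct sink and dump_printf() are made-up names, not kernel APIs.

#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>

struct sink {
        FILE *seq;              /* stands in for the seq_file */
        bool to_dmesg;          /* set when dumping a specific pgd */
};

static void dump_printf(struct sink *s, const char *fmt, ...)
{
        va_list ap;

        va_start(ap, fmt);
        if (s->to_dmesg)
                vfprintf(stderr, fmt, ap);      /* models printk() */
        else if (s->seq)
                vfprintf(s->seq, fmt, ap);      /* models seq_printf() */
        va_end(ap);
}

int main(void)
{
        struct sink s = { .seq = stdout, .to_dmesg = false };

        dump_printf(&s, "---[ %s ]---\n", "Low Kernel Mapping");
        s.to_dmesg = true;      /* models the ptdump_walk_pgd_level(NULL, pgd) path */
        dump_printf(&s, "---[ %s ]---\n", "ESPfix Area");
        return 0;
}
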
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 3aaeffcfd67..36642793e31 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -8,7 +8,7 @@
#include <linux/kdebug.h> /* oops_begin/end, ... */
#include <linux/module.h> /* search_exception_table */
#include <linux/bootmem.h> /* max_low_pfn */
-#include <linux/kprobes.h> /* __kprobes, ... */
+#include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */
#include <linux/mmiotrace.h> /* kmmio_handler, ... */
#include <linux/perf_event.h> /* perf_sw_event */
#include <linux/hugetlb.h> /* hstate_index_to_shift */
@@ -18,7 +18,11 @@
#include <asm/traps.h> /* dotraplinkage, ... */
#include <asm/pgalloc.h> /* pgd_*(), ... */
#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
-#include <asm/fixmap.h> /* VSYSCALL_START */
+#include <asm/fixmap.h> /* VSYSCALL_ADDR */
+#include <asm/vsyscall.h> /* emulate_vsyscall */
+
+#define CREATE_TRACE_POINTS
+#include <asm/trace/exceptions.h>
/*
* Page fault error code bits:
@@ -42,7 +46,7 @@ enum x86_pf_error_code {
* Returns 0 if mmiotrace is disabled, or if the fault is not
* handled by mmiotrace:
*/
-static inline int __kprobes
+static nokprobe_inline int
kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
if (unlikely(is_kmmio_active()))
@@ -51,7 +55,7 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr)
return 0;
}
-static inline int __kprobes notify_page_fault(struct pt_regs *regs)
+static nokprobe_inline int kprobes_fault(struct pt_regs *regs)
{
int ret = 0;
@@ -258,7 +262,7 @@ void vmalloc_sync_all(void)
*
* Handle a fault on the vmalloc or module mapping area
*/
-static noinline __kprobes int vmalloc_fault(unsigned long address)
+static noinline int vmalloc_fault(unsigned long address)
{
unsigned long pgd_paddr;
pmd_t *pmd_k;
@@ -288,6 +292,7 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
return 0;
}
+NOKPROBE_SYMBOL(vmalloc_fault);
/*
* Did it hit the DOS screen memory VA from vm86 mode?
@@ -355,7 +360,7 @@ void vmalloc_sync_all(void)
*
* This assumes no large pages in there.
*/
-static noinline __kprobes int vmalloc_fault(unsigned long address)
+static noinline int vmalloc_fault(unsigned long address)
{
pgd_t *pgd, *pgd_ref;
pud_t *pud, *pud_ref;
@@ -422,6 +427,7 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
return 0;
}
+NOKPROBE_SYMBOL(vmalloc_fault);
#ifdef CONFIG_CPU_SUP_AMD
static const char errata93_warning[] =
@@ -581,8 +587,13 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
if (error_code & PF_INSTR) {
unsigned int level;
+ pgd_t *pgd;
+ pte_t *pte;
+
+ pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
+ pgd += pgd_index(address);
- pte_t *pte = lookup_address(address, &level);
+ pte = lookup_address_in_pgd(pgd, address, &level);
if (pte && pte_present(*pte) && !pte_exec(*pte))
printk(nx_warning, from_kuid(&init_user_ns, current_uid()));
@@ -596,7 +607,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
printk(KERN_CONT " at %p\n", (void *) address);
printk(KERN_ALERT "IP:");
- printk_address(regs->ip, 1);
+ printk_address(regs->ip);
dump_pagetable(address);
}
@@ -638,6 +649,20 @@ no_context(struct pt_regs *regs, unsigned long error_code,
/* Are we prepared to handle this kernel fault? */
if (fixup_exception(regs)) {
+ /*
+ * Any interrupt that takes a fault gets the fixup. This makes
+ * the below recursive fault logic only apply to a faults from
+ * task context.
+ */
+ if (in_interrupt())
+ return;
+
+ /*
+ * Per the above we're !in_interrupt(), aka. task context.
+ *
+ * In this case we need to make sure we're not recursively
+ * faulting through the emulate_vsyscall() logic.
+ */
if (current_thread_info()->sig_on_uaccess_error && signal) {
tsk->thread.trap_nr = X86_TRAP_PF;
tsk->thread.error_code = error_code | PF_USER;
@@ -646,6 +671,10 @@ no_context(struct pt_regs *regs, unsigned long error_code,
/* XXX: hwpoison faults will set the wrong code. */
force_sig_info_fault(signal, si_code, address, tsk, 0);
}
+
+ /*
+ * Barring that, we can do the fixup and be happy.
+ */
return;
}
@@ -745,7 +774,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
* emulation.
*/
if (unlikely((error_code & PF_INSTR) &&
- ((address & ~0xfff) == VSYSCALL_START))) {
+ ((address & ~0xfff) == VSYSCALL_ADDR))) {
if (emulate_vsyscall(regs, address))
return;
}
@@ -901,7 +930,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
* There are no security implications to leaving a stale TLB when
* increasing the permissions on a page.
*/
-static noinline __kprobes int
+static noinline int
spurious_fault(unsigned long error_code, unsigned long address)
{
pgd_t *pgd;
@@ -949,6 +978,7 @@ spurious_fault(unsigned long error_code, unsigned long address)
return ret;
}
+NOKPROBE_SYMBOL(spurious_fault);
int show_unhandled_signals = 1;
@@ -980,6 +1010,12 @@ static int fault_in_kernel_space(unsigned long address)
static inline bool smap_violation(int error_code, struct pt_regs *regs)
{
+ if (!IS_ENABLED(CONFIG_X86_SMAP))
+ return false;
+
+ if (!static_cpu_has(X86_FEATURE_SMAP))
+ return false;
+
if (error_code & PF_USER)
return false;
@@ -993,13 +1029,17 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
* This routine handles page faults. It determines the address,
* and the problem, and then passes it off to one of the appropriate
* routines.
+ *
+ * This function must have noinline because both callers
+ * {,trace_}do_page_fault() have notrace on. Having this an actual function
+ * guarantees there's a function trace entry.
*/
-static void __kprobes
-__do_page_fault(struct pt_regs *regs, unsigned long error_code)
+static noinline void
+__do_page_fault(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
- unsigned long address;
struct mm_struct *mm;
int fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
@@ -1007,9 +1047,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
tsk = current;
mm = tsk->mm;
- /* Get the faulting address: */
- address = read_cr2();
-
/*
* Detect and handle instructions that would cause a page fault for
* both a tracked kernel page and a userspace page.
@@ -1048,7 +1085,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
return;
/* kprobes don't want to hook the spurious faults: */
- if (notify_page_fault(regs))
+ if (kprobes_fault(regs))
return;
/*
* Don't take the mm semaphore here. If we fixup a prefetch
@@ -1060,8 +1097,26 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
}
/* kprobes don't want to hook the spurious faults: */
- if (unlikely(notify_page_fault(regs)))
+ if (unlikely(kprobes_fault(regs)))
return;
+
+ if (unlikely(error_code & PF_RSVD))
+ pgtable_bad(regs, error_code, address);
+
+ if (unlikely(smap_violation(error_code, regs))) {
+ bad_area_nosemaphore(regs, error_code, address);
+ return;
+ }
+
+ /*
+ * If we're in an interrupt, have no user context or are running
+ * in an atomic region then we must not take the fault:
+ */
+ if (unlikely(in_atomic() || !mm)) {
+ bad_area_nosemaphore(regs, error_code, address);
+ return;
+ }
+
/*
* It's safe to allow irq's after cr2 has been saved and the
* vmalloc fault has been handled.
@@ -1078,27 +1133,8 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
local_irq_enable();
}
- if (unlikely(error_code & PF_RSVD))
- pgtable_bad(regs, error_code, address);
-
- if (static_cpu_has(X86_FEATURE_SMAP)) {
- if (unlikely(smap_violation(error_code, regs))) {
- bad_area_nosemaphore(regs, error_code, address);
- return;
- }
- }
-
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
- /*
- * If we're in an interrupt, have no user context or are running
- * in an atomic region then we must not take the fault:
- */
- if (unlikely(in_atomic() || !mm)) {
- bad_area_nosemaphore(regs, error_code, address);
- return;
- }
-
if (error_code & PF_WRITE)
flags |= FAULT_FLAG_WRITE;
@@ -1221,13 +1257,55 @@ good_area:
up_read(&mm->mmap_sem);
}
+NOKPROBE_SYMBOL(__do_page_fault);
-dotraplinkage void __kprobes
+dotraplinkage void notrace
do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
+ unsigned long address = read_cr2(); /* Get the faulting address */
+ enum ctx_state prev_state;
+
+ /*
+ * We must have this function tagged with __kprobes, notrace and call
+ * read_cr2() before calling anything else. To avoid calling any kind
+ * of tracing machinery before we've observed the CR2 value.
+ *
+ * exception_{enter,exit}() contain all sorts of tracepoints.
+ */
+
+ prev_state = exception_enter();
+ __do_page_fault(regs, error_code, address);
+ exception_exit(prev_state);
+}
+NOKPROBE_SYMBOL(do_page_fault);
+
+#ifdef CONFIG_TRACING
+static nokprobe_inline void
+trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
+ unsigned long error_code)
+{
+ if (user_mode(regs))
+ trace_page_fault_user(address, regs, error_code);
+ else
+ trace_page_fault_kernel(address, regs, error_code);
+}
+
+dotraplinkage void notrace
+trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+ /*
+ * The exception_enter and tracepoint processing could
+ * trigger another page faults (user space callchain
+ * reading) and destroy the original cr2 value, so read
+ * the faulting address now.
+ */
+ unsigned long address = read_cr2();
enum ctx_state prev_state;
prev_state = exception_enter();
- __do_page_fault(regs, error_code);
+ trace_page_fault_entries(address, regs, error_code);
+ __do_page_fault(regs, error_code, address);
exception_exit(prev_state);
}
+NOKPROBE_SYMBOL(trace_do_page_fault);
+#endif /* CONFIG_TRACING */
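
Both do_page_fault() and the new trace_do_page_fault() above read CR2 before exception_enter() or any tracepoint runs, since that machinery may fault again (for example while reading a user-space callchain) and overwrite CR2. The toy program below, with entirely made-up names, only models why the faulting address has to be latched first.

#include <stdio.h>

static unsigned long fake_cr2;          /* stands in for the CR2 register */

static void tracing_hook(void)
{
        fake_cr2 = 0xdeadbeef;          /* a nested fault would clobber CR2 */
}

static void handle_fault(unsigned long error_code)
{
        unsigned long address = fake_cr2;       /* read_cr2() first ... */

        tracing_hook();                         /* ... then tracing is safe */
        printf("fault at %#lx, error_code=%#lx\n", address, error_code);
}

int main(void)
{
        fake_cr2 = 0x7fffdeadb000UL;
        handle_fault(0x6);
        return 0;
}
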
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index dd74e46828c..207d9aef662 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -83,6 +83,12 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
pte_t pte = gup_get_pte(ptep);
struct page *page;
+ /* Similar to the PMD case, NUMA hinting must take slow path */
+ if (pte_numa(pte)) {
+ pte_unmap(ptep);
+ return 0;
+ }
+
if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
pte_unmap(ptep);
return 0;
@@ -102,8 +108,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
static inline void get_head_page_multiple(struct page *page, int nr)
{
- VM_BUG_ON(page != compound_head(page));
- VM_BUG_ON(page_count(page) == 0);
+ VM_BUG_ON_PAGE(page != compound_head(page), page);
+ VM_BUG_ON_PAGE(page_count(page) == 0, page);
atomic_add(nr, &page->_count);
SetPageReferenced(page);
}
@@ -129,7 +135,7 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
head = pte_page(pte);
page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
do {
- VM_BUG_ON(compound_head(page) != head);
+ VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
if (PageTail(page))
get_huge_page_tail(page);
@@ -167,6 +173,13 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
if (pmd_none(pmd) || pmd_trans_splitting(pmd))
return 0;
if (unlikely(pmd_large(pmd))) {
+ /*
+ * NUMA hinting faults need to be handled in the GUP
+ * slowpath for accounting purposes and so that they
+ * can be serialised against THP migration.
+ */
+ if (pmd_numa(pmd))
+ return 0;
if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
return 0;
} else {
@@ -199,7 +212,7 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
head = pte_page(pte);
page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
do {
- VM_BUG_ON(compound_head(page) != head);
+ VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
if (PageTail(page))
get_huge_page_tail(page);
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 9d980d88b74..8b977ebf938 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -58,11 +58,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
{
return NULL;
}
-
-int pmd_huge_support(void)
-{
- return 0;
-}
#else
struct page *
@@ -80,16 +75,9 @@ int pud_huge(pud_t pud)
{
return !!(pud_val(pud) & _PAGE_PSE);
}
-
-int pmd_huge_support(void)
-{
- return 1;
-}
#endif
-/* x86_64 also uses this file */
-
-#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
+#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
@@ -99,7 +87,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
info.flags = 0;
info.length = len;
- info.low_limit = TASK_UNMAPPED_BASE;
+ info.low_limit = current->mm->mmap_legacy_base;
info.high_limit = TASK_SIZE;
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
@@ -172,8 +160,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
return hugetlb_get_unmapped_area_topdown(file, addr, len,
pgoff, flags);
}
-
-#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/
+#endif /* CONFIG_HUGETLB_PAGE */
#ifdef CONFIG_X86_64
static __init int setup_hugepagesz(char *opt)
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 04664cdb7fd..f9713061811 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -53,12 +53,12 @@ __ref void *alloc_low_pages(unsigned int num)
if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
unsigned long ret;
if (min_pfn_mapped >= max_pfn_mapped)
- panic("alloc_low_page: ran out of memory");
+ panic("alloc_low_pages: ran out of memory");
ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
max_pfn_mapped << PAGE_SHIFT,
PAGE_SIZE * num , PAGE_SIZE);
if (!ret)
- panic("alloc_low_page: can not alloc memory");
+ panic("alloc_low_pages: can not alloc memory");
memblock_reserve(ret, PAGE_SIZE * num);
pfn = ret >> PAGE_SHIFT;
} else {
@@ -399,29 +399,46 @@ static unsigned long __init init_range_memory_mapping(
return mapped_ram_size;
}
-/* (PUD_SHIFT-PMD_SHIFT)/2 */
-#define STEP_SIZE_SHIFT 5
-void __init init_mem_mapping(void)
+static unsigned long __init get_new_step_size(unsigned long step_size)
+{
+ /*
+ * Explain why we shift by 5 and why we don't have to worry about
+ * 'step_size << 5' overflowing:
+ *
+ * initial mapped size is PMD_SIZE (2M).
+ * We can not set step_size to be PUD_SIZE (1G) yet.
+ * In worse case, when we cross the 1G boundary, and
+ * PG_LEVEL_2M is not set, we will need 1+1+512 pages (2M + 8k)
+ * to map 1G range with PTE. Use 5 as shift for now.
+ *
+ * Don't need to worry about overflow, on 32bit, when step_size
+ * is 0, round_down() returns 0 for start, and that turns it
+ * into 0x100000000ULL.
+ */
+ return step_size << 5;
+}
+
+/**
+ * memory_map_top_down - Map [map_start, map_end) top down
+ * @map_start: start address of the target memory range
+ * @map_end: end address of the target memory range
+ *
+ * This function will setup direct mapping for memory range
+ * [map_start, map_end) in top-down. That said, the page tables
+ * will be allocated at the end of the memory, and we map the
+ * memory in top-down.
+ */
+static void __init memory_map_top_down(unsigned long map_start,
+ unsigned long map_end)
{
- unsigned long end, real_end, start, last_start;
+ unsigned long real_end, start, last_start;
unsigned long step_size;
unsigned long addr;
unsigned long mapped_ram_size = 0;
unsigned long new_mapped_ram_size;
- probe_page_size_mask();
-
-#ifdef CONFIG_X86_64
- end = max_pfn << PAGE_SHIFT;
-#else
- end = max_low_pfn << PAGE_SHIFT;
-#endif
-
- /* the ISA range is always mapped regardless of memory holes */
- init_memory_mapping(0, ISA_END_ADDRESS);
-
/* xen has big range in reserved near end of ram, skip it at first.*/
- addr = memblock_find_in_range(ISA_END_ADDRESS, end, PMD_SIZE, PMD_SIZE);
+ addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE);
real_end = addr + PMD_SIZE;
/* step_size need to be small so pgt_buf from BRK could cover it */
@@ -436,25 +453,106 @@ void __init init_mem_mapping(void)
* end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages
* for page table.
*/
- while (last_start > ISA_END_ADDRESS) {
+ while (last_start > map_start) {
if (last_start > step_size) {
start = round_down(last_start - 1, step_size);
- if (start < ISA_END_ADDRESS)
- start = ISA_END_ADDRESS;
+ if (start < map_start)
+ start = map_start;
} else
- start = ISA_END_ADDRESS;
+ start = map_start;
new_mapped_ram_size = init_range_memory_mapping(start,
last_start);
last_start = start;
min_pfn_mapped = last_start >> PAGE_SHIFT;
/* only increase step_size after big range get mapped */
if (new_mapped_ram_size > mapped_ram_size)
- step_size <<= STEP_SIZE_SHIFT;
+ step_size = get_new_step_size(step_size);
mapped_ram_size += new_mapped_ram_size;
}
- if (real_end < end)
- init_range_memory_mapping(real_end, end);
+ if (real_end < map_end)
+ init_range_memory_mapping(real_end, map_end);
+}
+
+/**
+ * memory_map_bottom_up - Map [map_start, map_end) bottom up
+ * @map_start: start address of the target memory range
+ * @map_end: end address of the target memory range
+ *
+ * This function will setup direct mapping for memory range
+ * [map_start, map_end) in bottom-up. Since we have limited the
+ * bottom-up allocation above the kernel, the page tables will
+ * be allocated just above the kernel and we map the memory
+ * in [map_start, map_end) in bottom-up.
+ */
+static void __init memory_map_bottom_up(unsigned long map_start,
+ unsigned long map_end)
+{
+ unsigned long next, new_mapped_ram_size, start;
+ unsigned long mapped_ram_size = 0;
+ /* step_size need to be small so pgt_buf from BRK could cover it */
+ unsigned long step_size = PMD_SIZE;
+
+ start = map_start;
+ min_pfn_mapped = start >> PAGE_SHIFT;
+
+ /*
+ * We start from the bottom (@map_start) and go to the top (@map_end).
+ * The memblock_find_in_range() gets us a block of RAM from the
+ * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages
+ * for page table.
+ */
+ while (start < map_end) {
+ if (map_end - start > step_size) {
+ next = round_up(start + 1, step_size);
+ if (next > map_end)
+ next = map_end;
+ } else
+ next = map_end;
+
+ new_mapped_ram_size = init_range_memory_mapping(start, next);
+ start = next;
+
+ if (new_mapped_ram_size > mapped_ram_size)
+ step_size = get_new_step_size(step_size);
+ mapped_ram_size += new_mapped_ram_size;
+ }
+}
+
+void __init init_mem_mapping(void)
+{
+ unsigned long end;
+
+ probe_page_size_mask();
+
+#ifdef CONFIG_X86_64
+ end = max_pfn << PAGE_SHIFT;
+#else
+ end = max_low_pfn << PAGE_SHIFT;
+#endif
+
+ /* the ISA range is always mapped regardless of memory holes */
+ init_memory_mapping(0, ISA_END_ADDRESS);
+
+ /*
+ * If the allocation is in bottom-up direction, we setup direct mapping
+ * in bottom-up, otherwise we setup direct mapping in top-down.
+ */
+ if (memblock_bottom_up()) {
+ unsigned long kernel_end = __pa_symbol(_end);
+
+ /*
+ * we need two separate calls here. This is because we want to
+ * allocate page tables above the kernel. So we first map
+ * [kernel_end, end) to make memory above the kernel be mapped
+ * as soon as possible. And then use page tables allocated above
+ * the kernel to map [ISA_END_ADDRESS, kernel_end).
+ */
+ memory_map_bottom_up(kernel_end, end);
+ memory_map_bottom_up(ISA_END_ADDRESS, kernel_end);
+ } else {
+ memory_map_top_down(ISA_END_ADDRESS, end);
+ }
#ifdef CONFIG_X86_64
if (max_pfn > max_low_pfn) {
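
memory_map_top_down() above walks from the end of RAM back toward ISA_END_ADDRESS in chunks that grow via get_new_step_size() once enough memory has been mapped to supply page-table pages. The stand-alone sketch below reproduces only the chunking arithmetic (it grows the step every round for brevity, whereas the kernel grows it only after a large range was mapped); the 4G end address is illustrative and assumes a 64-bit build.

#include <stdio.h>

#define PMD_SIZE        (2UL << 20)

static unsigned long round_down_ul(unsigned long x, unsigned long align)
{
        return x & ~(align - 1);
}

int main(void)
{
        unsigned long map_start  = 1UL << 20;   /* ISA_END_ADDRESS */
        unsigned long last_start = 4UL << 30;   /* pretend RAM ends at 4G */
        unsigned long step_size  = PMD_SIZE;

        while (last_start > map_start) {
                unsigned long start;

                if (last_start > step_size) {
                        start = round_down_ul(last_start - 1, step_size);
                        if (start < map_start)
                                start = map_start;
                } else
                        start = map_start;

                printf("map [%#014lx, %#014lx) step=%#lx\n",
                       start, last_start, step_size);
                last_start = start;
                step_size <<= 5;        /* models get_new_step_size() */
        }
        return 0;
}
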
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 4287f1ffba7..e39504878ae 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -665,7 +665,7 @@ void __init initmem_init(void)
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
#endif
- memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
+ memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
sparse_memory_present_with_active_regions(0);
#ifdef CONFIG_FLATMEM
@@ -806,6 +806,9 @@ void __init mem_init(void)
BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END);
#undef high_memory
#undef __FIXADDR_TOP
+#ifdef CONFIG_RANDOMIZE_BASE
+ BUILD_BUG_ON(CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE);
+#endif
#ifdef CONFIG_HIGHMEM
BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 104d56a9245..df1a9927ad2 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -643,7 +643,7 @@ kernel_physical_mapping_init(unsigned long start,
#ifndef CONFIG_NUMA
void __init initmem_init(void)
{
- memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
+ memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
}
#endif
@@ -1055,8 +1055,8 @@ void __init mem_init(void)
after_bootmem = 1;
/* Register memory areas for /proc/kcore */
- kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
- VSYSCALL_END - VSYSCALL_START, KCORE_OTHER);
+ kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR,
+ PAGE_SIZE, KCORE_OTHER);
mem_init_print_info(NULL);
}
@@ -1185,11 +1185,19 @@ int kern_addr_valid(unsigned long addr)
* covers the 64bit vsyscall page now. 32bit has a real VMA now and does
* not need special handling anymore:
*/
+static const char *gate_vma_name(struct vm_area_struct *vma)
+{
+ return "[vsyscall]";
+}
+static struct vm_operations_struct gate_vma_ops = {
+ .name = gate_vma_name,
+};
static struct vm_area_struct gate_vma = {
- .vm_start = VSYSCALL_START,
- .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
+ .vm_start = VSYSCALL_ADDR,
+ .vm_end = VSYSCALL_ADDR + PAGE_SIZE,
.vm_page_prot = PAGE_READONLY_EXEC,
- .vm_flags = VM_READ | VM_EXEC
+ .vm_flags = VM_READ | VM_EXEC,
+ .vm_ops = &gate_vma_ops,
};
struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
@@ -1218,29 +1226,46 @@ int in_gate_area(struct mm_struct *mm, unsigned long addr)
*/
int in_gate_area_no_mm(unsigned long addr)
{
- return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
+ return (addr & PAGE_MASK) == VSYSCALL_ADDR;
}
-const char *arch_vma_name(struct vm_area_struct *vma)
+static unsigned long probe_memory_block_size(void)
{
- if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
- return "[vdso]";
- if (vma == &gate_vma)
- return "[vsyscall]";
- return NULL;
-}
+ /* start from 2g */
+ unsigned long bz = 1UL<<31;
#ifdef CONFIG_X86_UV
-unsigned long memory_block_size_bytes(void)
-{
if (is_uv_system()) {
printk(KERN_INFO "UV: memory block size 2GB\n");
return 2UL * 1024 * 1024 * 1024;
}
- return MIN_MEMORY_BLOCK_SIZE;
-}
#endif
+ /* less than 64g installed */
+ if ((max_pfn << PAGE_SHIFT) < (16UL << 32))
+ return MIN_MEMORY_BLOCK_SIZE;
+
+ /* get the tail size */
+ while (bz > MIN_MEMORY_BLOCK_SIZE) {
+ if (!((max_pfn << PAGE_SHIFT) & (bz - 1)))
+ break;
+ bz >>= 1;
+ }
+
+ printk(KERN_DEBUG "memory block size : %ldMB\n", bz >> 20);
+
+ return bz;
+}
+
+static unsigned long memory_block_size_probed;
+unsigned long memory_block_size_bytes(void)
+{
+ if (!memory_block_size_probed)
+ memory_block_size_probed = probe_memory_block_size();
+
+ return memory_block_size_probed;
+}
+
#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
* Initialise the sparsemem vmemmap using huge-pages at the PMD level.
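
probe_memory_block_size() above picks the largest block size, starting at 2G, that evenly divides the end of memory, falling back to the minimum on smaller machines. A stand-alone sketch of that heuristic follows, with an illustrative MIN_BLOCK and memory size rather than the kernel's constants.

#include <stdio.h>

#define MIN_BLOCK       (128UL << 20)   /* stands in for MIN_MEMORY_BLOCK_SIZE */

static unsigned long probe_block_size(unsigned long mem_end)
{
        unsigned long bz = 1UL << 31;           /* start from 2G */

        if (mem_end < (16UL << 32))             /* less than 64G installed */
                return MIN_BLOCK;

        while (bz > MIN_BLOCK) {
                if (!(mem_end & (bz - 1)))      /* bz divides the end of memory */
                        break;
                bz >>= 1;
        }
        return bz;
}

int main(void)
{
        unsigned long mem_end = 96UL << 30;     /* 96G ends on a 2G boundary */

        printf("memory block size: %luMB\n", probe_block_size(mem_end) >> 20);
        return 0;
}
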
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 799580cabc7..baff1da354e 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -50,6 +50,21 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size,
return err;
}
+static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages,
+ void *arg)
+{
+ unsigned long i;
+
+ for (i = 0; i < nr_pages; ++i)
+ if (pfn_valid(start_pfn + i) &&
+ !PageReserved(pfn_to_page(start_pfn + i)))
+ return 1;
+
+ WARN_ONCE(1, "ioremap on RAM pfn 0x%lx\n", start_pfn);
+
+ return 0;
+}
+
/*
* Remap an arbitrary physical address space into the kernel virtual
* address space. Needed when the kernel wants to access high addresses
@@ -93,14 +108,11 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
/*
* Don't allow anybody to remap normal RAM that we're using..
*/
+ pfn = phys_addr >> PAGE_SHIFT;
last_pfn = last_addr >> PAGE_SHIFT;
- for (pfn = phys_addr >> PAGE_SHIFT; pfn <= last_pfn; pfn++) {
- int is_ram = page_is_ram(pfn);
-
- if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
- return NULL;
- WARN_ON_ONCE(is_ram);
- }
+ if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL,
+ __ioremap_check_ram) == 1)
+ return NULL;
/*
* Mappings have to be page-aligned
@@ -328,17 +340,6 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
return;
}
-static int __initdata early_ioremap_debug;
-
-static int __init early_ioremap_debug_setup(char *str)
-{
- early_ioremap_debug = 1;
-
- return 0;
-}
-early_param("early_ioremap_debug", early_ioremap_debug_setup);
-
-static __initdata int after_paging_init;
static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
@@ -362,18 +363,17 @@ bool __init is_early_ioremap_ptep(pte_t *ptep)
return ptep >= &bm_pte[0] && ptep < &bm_pte[PAGE_SIZE/sizeof(pte_t)];
}
-static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
-
void __init early_ioremap_init(void)
{
pmd_t *pmd;
- int i;
- if (early_ioremap_debug)
- printk(KERN_INFO "early_ioremap_init()\n");
+#ifdef CONFIG_X86_64
+ BUILD_BUG_ON((fix_to_virt(0) + PAGE_SIZE) & ((1 << PMD_SHIFT) - 1));
+#else
+ WARN_ON((fix_to_virt(0) + PAGE_SIZE) & ((1 << PMD_SHIFT) - 1));
+#endif
- for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
- slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
+ early_ioremap_setup();
pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
memset(bm_pte, 0, sizeof(bm_pte));
@@ -402,13 +402,8 @@ void __init early_ioremap_init(void)
}
}
-void __init early_ioremap_reset(void)
-{
- after_paging_init = 1;
-}
-
-static void __init __early_set_fixmap(enum fixed_addresses idx,
- phys_addr_t phys, pgprot_t flags)
+void __init __early_set_fixmap(enum fixed_addresses idx,
+ phys_addr_t phys, pgprot_t flags)
{
unsigned long addr = __fix_to_virt(idx);
pte_t *pte;
@@ -425,198 +420,3 @@ static void __init __early_set_fixmap(enum fixed_addresses idx,
pte_clear(&init_mm, addr, pte);
__flush_tlb_one(addr);
}
-
-static inline void __init early_set_fixmap(enum fixed_addresses idx,
- phys_addr_t phys, pgprot_t prot)
-{
- if (after_paging_init)
- __set_fixmap(idx, phys, prot);
- else
- __early_set_fixmap(idx, phys, prot);
-}
-
-static inline void __init early_clear_fixmap(enum fixed_addresses idx)
-{
- if (after_paging_init)
- clear_fixmap(idx);
- else
- __early_set_fixmap(idx, 0, __pgprot(0));
-}
-
-static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
-static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
-
-void __init fixup_early_ioremap(void)
-{
- int i;
-
- for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
- if (prev_map[i]) {
- WARN_ON(1);
- break;
- }
- }
-
- early_ioremap_init();
-}
-
-static int __init check_early_ioremap_leak(void)
-{
- int count = 0;
- int i;
-
- for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
- if (prev_map[i])
- count++;
-
- if (!count)
- return 0;
- WARN(1, KERN_WARNING
- "Debug warning: early ioremap leak of %d areas detected.\n",
- count);
- printk(KERN_WARNING
- "please boot with early_ioremap_debug and report the dmesg.\n");
-
- return 1;
-}
-late_initcall(check_early_ioremap_leak);
-
-static void __init __iomem *
-__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
-{
- unsigned long offset;
- resource_size_t last_addr;
- unsigned int nrpages;
- enum fixed_addresses idx;
- int i, slot;
-
- WARN_ON(system_state != SYSTEM_BOOTING);
-
- slot = -1;
- for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
- if (!prev_map[i]) {
- slot = i;
- break;
- }
- }
-
- if (slot < 0) {
- printk(KERN_INFO "%s(%08llx, %08lx) not found slot\n",
- __func__, (u64)phys_addr, size);
- WARN_ON(1);
- return NULL;
- }
-
- if (early_ioremap_debug) {
- printk(KERN_INFO "%s(%08llx, %08lx) [%d] => ",
- __func__, (u64)phys_addr, size, slot);
- dump_stack();
- }
-
- /* Don't allow wraparound or zero size */
- last_addr = phys_addr + size - 1;
- if (!size || last_addr < phys_addr) {
- WARN_ON(1);
- return NULL;
- }
-
- prev_size[slot] = size;
- /*
- * Mappings have to be page-aligned
- */
- offset = phys_addr & ~PAGE_MASK;
- phys_addr &= PAGE_MASK;
- size = PAGE_ALIGN(last_addr + 1) - phys_addr;
-
- /*
- * Mappings have to fit in the FIX_BTMAP area.
- */
- nrpages = size >> PAGE_SHIFT;
- if (nrpages > NR_FIX_BTMAPS) {
- WARN_ON(1);
- return NULL;
- }
-
- /*
- * Ok, go for it..
- */
- idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
- while (nrpages > 0) {
- early_set_fixmap(idx, phys_addr, prot);
- phys_addr += PAGE_SIZE;
- --idx;
- --nrpages;
- }
- if (early_ioremap_debug)
- printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]);
-
- prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
- return prev_map[slot];
-}
-
-/* Remap an IO device */
-void __init __iomem *
-early_ioremap(resource_size_t phys_addr, unsigned long size)
-{
- return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO);
-}
-
-/* Remap memory */
-void __init __iomem *
-early_memremap(resource_size_t phys_addr, unsigned long size)
-{
- return __early_ioremap(phys_addr, size, PAGE_KERNEL);
-}
-
-void __init early_iounmap(void __iomem *addr, unsigned long size)
-{
- unsigned long virt_addr;
- unsigned long offset;
- unsigned int nrpages;
- enum fixed_addresses idx;
- int i, slot;
-
- slot = -1;
- for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
- if (prev_map[i] == addr) {
- slot = i;
- break;
- }
- }
-
- if (slot < 0) {
- printk(KERN_INFO "early_iounmap(%p, %08lx) not found slot\n",
- addr, size);
- WARN_ON(1);
- return;
- }
-
- if (prev_size[slot] != size) {
- printk(KERN_INFO "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n",
- addr, size, slot, prev_size[slot]);
- WARN_ON(1);
- return;
- }
-
- if (early_ioremap_debug) {
- printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
- size, slot);
- dump_stack();
- }
-
- virt_addr = (unsigned long)addr;
- if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) {
- WARN_ON(1);
- return;
- }
- offset = virt_addr & ~PAGE_MASK;
- nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT;
-
- idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
- while (nrpages > 0) {
- early_clear_fixmap(idx);
- --idx;
- --nrpages;
- }
- prev_map[slot] = NULL;
-}
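
The rewritten RAM check in __ioremap_caller() above hands __ioremap_check_ram() to walk_system_ram_range() and refuses the mapping if any page in the range is ordinary, non-reserved RAM. Below is a user-space sketch of just that predicate-over-a-pfn-range pattern; pfn_is_plain_ram() and the sample ranges are invented for illustration.

#include <stdbool.h>
#include <stdio.h>

static bool pfn_is_plain_ram(unsigned long pfn)
{
        return pfn >= 0x100 && pfn < 0x200;     /* pretend [1M, 2M) is RAM */
}

static int check_ram(unsigned long start_pfn, unsigned long nr_pages)
{
        unsigned long i;

        for (i = 0; i < nr_pages; i++)
                if (pfn_is_plain_ram(start_pfn + i))
                        return 1;               /* caller must fail the remap */
        return 0;
}

int main(void)
{
        printf("remap MMIO at pfn 0x300: %s\n",
               check_ram(0x300, 16) ? "rejected" : "ok");
        printf("remap RAM  at pfn 0x180: %s\n",
               check_ram(0x180, 16) ? "rejected" : "ok");
        return 0;
}
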
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
index d87dd6d042d..dd89a13f105 100644
--- a/arch/x86/mm/kmemcheck/kmemcheck.c
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -78,10 +78,16 @@ early_initcall(kmemcheck_init);
*/
static int __init param_kmemcheck(char *str)
{
+ int val;
+ int ret;
+
if (!str)
return -EINVAL;
- sscanf(str, "%d", &kmemcheck_enabled);
+ ret = kstrtoint(str, 0, &val);
+ if (ret)
+ return ret;
+ kmemcheck_enabled = val;
return 0;
}
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index e5d5e2ce9f7..637ab34ed63 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -11,7 +11,6 @@
#include <linux/rculist.h>
#include <linux/spinlock.h>
#include <linux/hash.h>
-#include <linux/init.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 8dabbed409e..1e9da795767 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -74,7 +74,7 @@ static void __init do_one_pass(u64 pattern, u64 start, u64 end)
u64 i;
phys_addr_t this_start, this_end;
- for_each_free_mem_range(i, MAX_NUMNODES, &this_start, &this_end, NULL) {
+ for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) {
this_start = clamp_t(phys_addr_t, this_start, start, end);
this_end = clamp_t(phys_addr_t, this_end, start, end);
if (this_start < this_end) {
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 8bf93bae1f1..a32b706c401 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -211,9 +211,13 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
*/
nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
if (!nd_pa) {
- pr_err("Cannot find %zu bytes in node %d\n",
- nd_size, nid);
- return;
+ nd_pa = __memblock_alloc_base(nd_size, SMP_CACHE_BYTES,
+ MEMBLOCK_ALLOC_ACCESSIBLE);
+ if (!nd_pa) {
+ pr_err("Cannot find %zu bytes in node %d\n",
+ nd_size, nid);
+ return;
+ }
}
nd = __va(nd_pa);
@@ -487,7 +491,8 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
for (i = 0; i < mi->nr_blks; i++) {
struct numa_memblk *mb = &mi->blk[i];
- memblock_set_node(mb->start, mb->end - mb->start, mb->nid);
+ memblock_set_node(mb->start, mb->end - mb->start,
+ &memblock.memory, mb->nid);
}
/*
@@ -549,6 +554,41 @@ static void __init numa_init_array(void)
}
}
+static void __init numa_clear_kernel_node_hotplug(void)
+{
+ int i, nid;
+ nodemask_t numa_kernel_nodes = NODE_MASK_NONE;
+ unsigned long start, end;
+ struct memblock_region *r;
+
+ /*
+ * At this time, all memory regions reserved by memblock are
+ * used by the kernel. Set the nid in memblock.reserved will
+ * mark out all the nodes the kernel resides in.
+ */
+ for (i = 0; i < numa_meminfo.nr_blks; i++) {
+ struct numa_memblk *mb = &numa_meminfo.blk[i];
+ memblock_set_node(mb->start, mb->end - mb->start,
+ &memblock.reserved, mb->nid);
+ }
+
+ /* Mark all kernel nodes. */
+ for_each_memblock(reserved, r)
+ node_set(r->nid, numa_kernel_nodes);
+
+ /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */
+ for (i = 0; i < numa_meminfo.nr_blks; i++) {
+ nid = numa_meminfo.blk[i].nid;
+ if (!node_isset(nid, numa_kernel_nodes))
+ continue;
+
+ start = numa_meminfo.blk[i].start;
+ end = numa_meminfo.blk[i].end;
+
+ memblock_clear_hotplug(start, end - start);
+ }
+}
+
static int __init numa_init(int (*init_func)(void))
{
int i;
@@ -561,12 +601,28 @@ static int __init numa_init(int (*init_func)(void))
nodes_clear(node_possible_map);
nodes_clear(node_online_map);
memset(&numa_meminfo, 0, sizeof(numa_meminfo));
- WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES));
+ WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory,
+ MAX_NUMNODES));
+ WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved,
+ MAX_NUMNODES));
+ /* In case that parsing SRAT failed. */
+ WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX));
numa_reset_distance();
ret = init_func();
if (ret < 0)
return ret;
+
+ /*
+ * We reset memblock back to the top-down direction
+ * here because if we configured ACPI_NUMA, we have
+ * parsed SRAT in init_func(). It is ok to have the
+ * reset here even if we did't configure ACPI_NUMA
+ * or acpi numa init fails and fallbacks to dummy
+ * numa init.
+ */
+ memblock_set_bottom_up(false);
+
ret = numa_cleanup_meminfo(&numa_meminfo);
if (ret < 0)
return ret;
@@ -586,6 +642,16 @@ static int __init numa_init(int (*init_func)(void))
numa_clear_node(i);
}
numa_init_array();
+
+ /*
+ * At very early time, the kernel have to use some memory such as
+ * loading the kernel image. We cannot prevent this anyway. So any
+ * node the kernel resides in should be un-hotpluggable.
+ *
+ * And when we come here, numa_init() won't fail.
+ */
+ numa_clear_kernel_node_hotplug();
+
return 0;
}
@@ -621,10 +687,6 @@ static int __init dummy_numa_init(void)
void __init x86_numa_init(void)
{
if (!numa_off) {
-#ifdef CONFIG_X86_NUMAQ
- if (!numa_init(numaq_numa_init))
- return;
-#endif
#ifdef CONFIG_ACPI_NUMA
if (!numa_init(x86_acpi_numa_init))
return;
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 0342d27ca79..47b6436e41c 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -52,6 +52,8 @@ void memory_present(int nid, unsigned long start, unsigned long end)
nid, start, end);
printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid);
printk(KERN_DEBUG " ");
+ start = round_down(start, PAGES_PER_SECTION);
+ end = round_up(end, PAGES_PER_SECTION);
for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
physnode_map[pfn / PAGES_PER_SECTION] = nid;
printk(KERN_CONT "%lx ", pfn);
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index d0b1773d9d2..6629f397b46 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -8,7 +8,6 @@
#include <linux/kthread.h>
#include <linux/random.h>
#include <linux/kernel.h>
-#include <linux/init.h>
#include <linux/mm.h>
#include <asm/cacheflush.h>
@@ -36,7 +35,7 @@ enum {
static int pte_testbit(pte_t pte)
{
- return pte_flags(pte) & _PAGE_UNUSED1;
+ return pte_flags(pte) & _PAGE_SOFTW1;
}
struct split_state {
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index bb32480c2d7..ae242a7c11c 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -30,6 +30,7 @@
*/
struct cpa_data {
unsigned long *vaddr;
+ pgd_t *pgd;
pgprot_t mask_set;
pgprot_t mask_clr;
int numpages;
@@ -125,8 +126,8 @@ within(unsigned long addr, unsigned long start, unsigned long end)
* @vaddr: virtual start address
* @size: number of bytes to flush
*
- * clflush is an unordered instruction which needs fencing with mfence
- * to avoid ordering issues.
+ * clflushopt is an unordered instruction which needs fencing with mfence or
+ * sfence to avoid ordering issues.
*/
void clflush_cache_range(void *vaddr, unsigned int size)
{
@@ -135,11 +136,11 @@ void clflush_cache_range(void *vaddr, unsigned int size)
mb();
for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
- clflush(vaddr);
+ clflushopt(vaddr);
/*
* Flush any possible final partial cacheline:
*/
- clflush(vend);
+ clflushopt(vend);
mb();
}
@@ -323,16 +324,12 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
}
/*
- * Lookup the page table entry for a virtual address. Return a pointer
- * to the entry and the level of the mapping.
- *
- * Note: We return pud and pmd either when the entry is marked large
- * or when the present bit is not set. Otherwise we would return a
- * pointer to a nonexisting mapping.
+ * Lookup the page table entry for a virtual address in a specific pgd.
+ * Return a pointer to the entry and the level of the mapping.
*/
-pte_t *lookup_address(unsigned long address, unsigned int *level)
+pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
+ unsigned int *level)
{
- pgd_t *pgd = pgd_offset_k(address);
pud_t *pud;
pmd_t *pmd;
@@ -361,8 +358,31 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
return pte_offset_kernel(pmd, address);
}
+
+/*
+ * Lookup the page table entry for a virtual address. Return a pointer
+ * to the entry and the level of the mapping.
+ *
+ * Note: We return pud and pmd either when the entry is marked large
+ * or when the present bit is not set. Otherwise we would return a
+ * pointer to a nonexisting mapping.
+ */
+pte_t *lookup_address(unsigned long address, unsigned int *level)
+{
+ return lookup_address_in_pgd(pgd_offset_k(address), address, level);
+}
EXPORT_SYMBOL_GPL(lookup_address);
+static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
+ unsigned int *level)
+{
+ if (cpa->pgd)
+ return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
+ address, level);
+
+ return lookup_address(address, level);
+}
+
/*
* This is necessary because __pa() does not work on some
* kinds of memory, like vmalloc() or the alloc_remap()
@@ -437,7 +457,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
* Check for races, another CPU might have split this page
* up already:
*/
- tmp = lookup_address(address, &level);
+ tmp = _lookup_address_cpa(cpa, address, &level);
if (tmp != kpte)
goto out_unlock;
@@ -543,7 +563,8 @@ out_unlock:
}
static int
-__split_large_page(pte_t *kpte, unsigned long address, struct page *base)
+__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
+ struct page *base)
{
pte_t *pbase = (pte_t *)page_address(base);
unsigned long pfn, pfninc = 1;
@@ -556,7 +577,7 @@ __split_large_page(pte_t *kpte, unsigned long address, struct page *base)
* Check for races, another CPU might have split this page
* up for us already:
*/
- tmp = lookup_address(address, &level);
+ tmp = _lookup_address_cpa(cpa, address, &level);
if (tmp != kpte) {
spin_unlock(&pgd_lock);
return 1;
@@ -632,7 +653,8 @@ __split_large_page(pte_t *kpte, unsigned long address, struct page *base)
return 0;
}
-static int split_large_page(pte_t *kpte, unsigned long address)
+static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
+ unsigned long address)
{
struct page *base;
@@ -644,15 +666,402 @@ static int split_large_page(pte_t *kpte, unsigned long address)
if (!base)
return -ENOMEM;
- if (__split_large_page(kpte, address, base))
+ if (__split_large_page(cpa, kpte, address, base))
__free_page(base);
return 0;
}
+static bool try_to_free_pte_page(pte_t *pte)
+{
+ int i;
+
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ if (!pte_none(pte[i]))
+ return false;
+
+ free_page((unsigned long)pte);
+ return true;
+}
+
+static bool try_to_free_pmd_page(pmd_t *pmd)
+{
+ int i;
+
+ for (i = 0; i < PTRS_PER_PMD; i++)
+ if (!pmd_none(pmd[i]))
+ return false;
+
+ free_page((unsigned long)pmd);
+ return true;
+}
+
+static bool try_to_free_pud_page(pud_t *pud)
+{
+ int i;
+
+ for (i = 0; i < PTRS_PER_PUD; i++)
+ if (!pud_none(pud[i]))
+ return false;
+
+ free_page((unsigned long)pud);
+ return true;
+}
+
+static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
+{
+ pte_t *pte = pte_offset_kernel(pmd, start);
+
+ while (start < end) {
+ set_pte(pte, __pte(0));
+
+ start += PAGE_SIZE;
+ pte++;
+ }
+
+ if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
+ pmd_clear(pmd);
+ return true;
+ }
+ return false;
+}
+
+static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
+ unsigned long start, unsigned long end)
+{
+ if (unmap_pte_range(pmd, start, end))
+ if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+ pud_clear(pud);
+}
+
+static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
+{
+ pmd_t *pmd = pmd_offset(pud, start);
+
+ /*
+ * Not on a 2MB page boundary?
+ */
+ if (start & (PMD_SIZE - 1)) {
+ unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
+ unsigned long pre_end = min_t(unsigned long, end, next_page);
+
+ __unmap_pmd_range(pud, pmd, start, pre_end);
+
+ start = pre_end;
+ pmd++;
+ }
+
+ /*
+ * Try to unmap in 2M chunks.
+ */
+ while (end - start >= PMD_SIZE) {
+ if (pmd_large(*pmd))
+ pmd_clear(pmd);
+ else
+ __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
+
+ start += PMD_SIZE;
+ pmd++;
+ }
+
+ /*
+ * 4K leftovers?
+ */
+ if (start < end)
+ return __unmap_pmd_range(pud, pmd, start, end);
+
+ /*
+ * Try again to free the PMD page if haven't succeeded above.
+ */
+ if (!pud_none(*pud))
+ if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+ pud_clear(pud);
+}
+
+static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+{
+ pud_t *pud = pud_offset(pgd, start);
+
+ /*
+ * Not on a GB page boundary?
+ */
+ if (start & (PUD_SIZE - 1)) {
+ unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
+ unsigned long pre_end = min_t(unsigned long, end, next_page);
+
+ unmap_pmd_range(pud, start, pre_end);
+
+ start = pre_end;
+ pud++;
+ }
+
+ /*
+ * Try to unmap in 1G chunks?
+ */
+ while (end - start >= PUD_SIZE) {
+
+ if (pud_large(*pud))
+ pud_clear(pud);
+ else
+ unmap_pmd_range(pud, start, start + PUD_SIZE);
+
+ start += PUD_SIZE;
+ pud++;
+ }
+
+ /*
+ * 2M leftovers?
+ */
+ if (start < end)
+ unmap_pmd_range(pud, start, end);
+
+ /*
+ * No need to try to free the PUD page because we'll free it in
+ * populate_pgd's error path
+ */
+}
+
+static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end)
+{
+ pgd_t *pgd_entry = root + pgd_index(addr);
+
+ unmap_pud_range(pgd_entry, addr, end);
+
+ if (try_to_free_pud_page((pud_t *)pgd_page_vaddr(*pgd_entry)))
+ pgd_clear(pgd_entry);
+}
+
+static int alloc_pte_page(pmd_t *pmd)
+{
+ pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+ if (!pte)
+ return -1;
+
+ set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
+ return 0;
+}
+
+static int alloc_pmd_page(pud_t *pud)
+{
+ pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+ if (!pmd)
+ return -1;
+
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+ return 0;
+}
+
+static void populate_pte(struct cpa_data *cpa,
+ unsigned long start, unsigned long end,
+ unsigned num_pages, pmd_t *pmd, pgprot_t pgprot)
+{
+ pte_t *pte;
+
+ pte = pte_offset_kernel(pmd, start);
+
+ while (num_pages-- && start < end) {
+
+ /* deal with the NX bit */
+ if (!(pgprot_val(pgprot) & _PAGE_NX))
+ cpa->pfn &= ~_PAGE_NX;
+
+ set_pte(pte, pfn_pte(cpa->pfn >> PAGE_SHIFT, pgprot));
+
+ start += PAGE_SIZE;
+ cpa->pfn += PAGE_SIZE;
+ pte++;
+ }
+}
+
+static int populate_pmd(struct cpa_data *cpa,
+ unsigned long start, unsigned long end,
+ unsigned num_pages, pud_t *pud, pgprot_t pgprot)
+{
+ unsigned int cur_pages = 0;
+ pmd_t *pmd;
+
+ /*
+ * Not on a 2M boundary?
+ */
+ if (start & (PMD_SIZE - 1)) {
+ unsigned long pre_end = start + (num_pages << PAGE_SHIFT);
+ unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
+
+ pre_end = min_t(unsigned long, pre_end, next_page);
+ cur_pages = (pre_end - start) >> PAGE_SHIFT;
+ cur_pages = min_t(unsigned int, num_pages, cur_pages);
+
+ /*
+ * Need a PTE page?
+ */
+ pmd = pmd_offset(pud, start);
+ if (pmd_none(*pmd))
+ if (alloc_pte_page(pmd))
+ return -1;
+
+ populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot);
+
+ start = pre_end;
+ }
+
+ /*
+ * We mapped them all?
+ */
+ if (num_pages == cur_pages)
+ return cur_pages;
+
+ while (end - start >= PMD_SIZE) {
+
+ /*
+ * We cannot use a 1G page so allocate a PMD page if needed.
+ */
+ if (pud_none(*pud))
+ if (alloc_pmd_page(pud))
+ return -1;
+
+ pmd = pmd_offset(pud, start);
+
+ set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot)));
+
+ start += PMD_SIZE;
+ cpa->pfn += PMD_SIZE;
+ cur_pages += PMD_SIZE >> PAGE_SHIFT;
+ }
+
+ /*
+ * Map trailing 4K pages.
+ */
+ if (start < end) {
+ pmd = pmd_offset(pud, start);
+ if (pmd_none(*pmd))
+ if (alloc_pte_page(pmd))
+ return -1;
+
+ populate_pte(cpa, start, end, num_pages - cur_pages,
+ pmd, pgprot);
+ }
+ return num_pages;
+}
+
+static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
+ pgprot_t pgprot)
+{
+ pud_t *pud;
+ unsigned long end;
+ int cur_pages = 0;
+
+ end = start + (cpa->numpages << PAGE_SHIFT);
+
+ /*
+ * Not on a Gb page boundary? => map everything up to it with
+ * smaller pages.
+ */
+ if (start & (PUD_SIZE - 1)) {
+ unsigned long pre_end;
+ unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
+
+ pre_end = min_t(unsigned long, end, next_page);
+ cur_pages = (pre_end - start) >> PAGE_SHIFT;
+ cur_pages = min_t(int, (int)cpa->numpages, cur_pages);
+
+ pud = pud_offset(pgd, start);
+
+ /*
+ * Need a PMD page?
+ */
+ if (pud_none(*pud))
+ if (alloc_pmd_page(pud))
+ return -1;
+
+ cur_pages = populate_pmd(cpa, start, pre_end, cur_pages,
+ pud, pgprot);
+ if (cur_pages < 0)
+ return cur_pages;
+
+ start = pre_end;
+ }
+
+ /* We mapped them all? */
+ if (cpa->numpages == cur_pages)
+ return cur_pages;
+
+ pud = pud_offset(pgd, start);
+
+ /*
+ * Map everything starting from the Gb boundary, possibly with 1G pages
+ */
+ while (end - start >= PUD_SIZE) {
+ set_pud(pud, __pud(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot)));
+
+ start += PUD_SIZE;
+ cpa->pfn += PUD_SIZE;
+ cur_pages += PUD_SIZE >> PAGE_SHIFT;
+ pud++;
+ }
+
+ /* Map trailing leftover */
+ if (start < end) {
+ int tmp;
+
+ pud = pud_offset(pgd, start);
+ if (pud_none(*pud))
+ if (alloc_pmd_page(pud))
+ return -1;
+
+ tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages,
+ pud, pgprot);
+ if (tmp < 0)
+ return cur_pages;
+
+ cur_pages += tmp;
+ }
+ return cur_pages;
+}
+
+/*
+ * Restrictions for kernel page table do not necessarily apply when mapping in
+ * an alternate PGD.
+ */
+static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
+{
+ pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
+ pud_t *pud = NULL; /* shut up gcc */
+ pgd_t *pgd_entry;
+ int ret;
+
+ pgd_entry = cpa->pgd + pgd_index(addr);
+
+ /*
+ * Allocate a PUD page and hand it down for mapping.
+ */
+ if (pgd_none(*pgd_entry)) {
+ pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+ if (!pud)
+ return -1;
+
+ set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE));
+ }
+
+ pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
+ pgprot_val(pgprot) |= pgprot_val(cpa->mask_set);
+
+ ret = populate_pud(cpa, addr, pgd_entry, pgprot);
+ if (ret < 0) {
+ unmap_pgd_range(cpa->pgd, addr,
+ addr + (cpa->numpages << PAGE_SHIFT));
+ return ret;
+ }
+
+ cpa->numpages = ret;
+ return 0;
+}
+
static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
int primary)
{
+ if (cpa->pgd)
+ return populate_pgd(cpa, vaddr);
+
/*
* Ignore all non primary paths.
*/
@@ -697,7 +1106,7 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
else
address = *cpa->vaddr;
repeat:
- kpte = lookup_address(address, &level);
+ kpte = _lookup_address_cpa(cpa, address, &level);
if (!kpte)
return __cpa_process_fault(cpa, address, primary);
@@ -761,7 +1170,7 @@ repeat:
/*
* We have to split the large page:
*/
- err = split_large_page(kpte, address);
+ err = split_large_page(cpa, kpte, address);
if (!err) {
/*
* Do a global flush tlb after splitting the large page
@@ -910,6 +1319,8 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
int ret, cache, checkalias;
unsigned long baddr = 0;
+ memset(&cpa, 0, sizeof(cpa));
+
/*
* Check, if we are requested to change a not supported
* feature:
@@ -982,10 +1393,10 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
cache = cache_attr(mask_set);
/*
- * On success we use clflush, when the CPU supports it to
- * avoid the wbindv. If the CPU does not support it and in the
+ * On success we use CLFLUSH, when the CPU supports it to
+ * avoid the WBINVD. If the CPU does not support it and in the
* error case we fall back to cpa_flush_all (which uses
- * wbindv):
+ * WBINVD):
*/
if (!ret && cpu_has_clflush) {
if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
@@ -1356,6 +1767,7 @@ static int __set_pages_p(struct page *page, int numpages)
{
unsigned long tempaddr = (unsigned long) page_address(page);
struct cpa_data cpa = { .vaddr = &tempaddr,
+ .pgd = NULL,
.numpages = numpages,
.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
.mask_clr = __pgprot(0),
@@ -1374,6 +1786,7 @@ static int __set_pages_np(struct page *page, int numpages)
{
unsigned long tempaddr = (unsigned long) page_address(page);
struct cpa_data cpa = { .vaddr = &tempaddr,
+ .pgd = NULL,
.numpages = numpages,
.mask_set = __pgprot(0),
.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
@@ -1434,6 +1847,42 @@ bool kernel_page_present(struct page *page)
#endif /* CONFIG_DEBUG_PAGEALLOC */
+int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
+ unsigned numpages, unsigned long page_flags)
+{
+ int retval = -EINVAL;
+
+ struct cpa_data cpa = {
+ .vaddr = &address,
+ .pfn = pfn,
+ .pgd = pgd,
+ .numpages = numpages,
+ .mask_set = __pgprot(0),
+ .mask_clr = __pgprot(0),
+ .flags = 0,
+ };
+
+ if (!(__supported_pte_mask & _PAGE_NX))
+ goto out;
+
+ if (!(page_flags & _PAGE_NX))
+ cpa.mask_clr = __pgprot(_PAGE_NX);
+
+ cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
+
+ retval = __change_page_attr_set_clr(&cpa, 0);
+ __flush_tlb_all();
+
+out:
+ return retval;
+}
+
+void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address,
+ unsigned numpages)
+{
+ unmap_pgd_range(root, address, address + (numpages << PAGE_SHIFT));
+}
+
/*
* The testcases use internal knowledge of the implementation that shouldn't
* be exposed to the rest of the kernel. Include these directly here.
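The head/large/tail split that the new populate_pmd()/populate_pud() helpers perform can be modelled outside the kernel. Below is a minimal userspace sketch, assuming 4K base pages and 2M PMD coverage (the x86_64 defaults); split_range() and its output format are made up for illustration and are not part of the patch.

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PMD_SHIFT  21
#define PMD_SIZE   (1UL << PMD_SHIFT)
#define PMD_MASK   (~(PMD_SIZE - 1))

/*
 * Split [start, end) the way populate_pmd() walks it: 4K pages up to
 * the first 2M boundary, 2M mappings in the middle, and 4K pages for
 * whatever is left at the tail.
 */
static void split_range(unsigned long start, unsigned long end)
{
	unsigned long cur = start;

	if (cur & (PMD_SIZE - 1)) {
		unsigned long next = (cur + PMD_SIZE) & PMD_MASK;
		unsigned long pre_end = next < end ? next : end;

		printf("head : %#lx-%#lx (%lu x 4K)\n",
		       cur, pre_end, (pre_end - cur) >> PAGE_SHIFT);
		cur = pre_end;
	}

	while (end - cur >= PMD_SIZE) {
		printf("large: %#lx-%#lx (1 x 2M)\n", cur, cur + PMD_SIZE);
		cur += PMD_SIZE;
	}

	if (cur < end)
		printf("tail : %#lx-%#lx (%lu x 4K)\n",
		       cur, end, (end - cur) >> PAGE_SHIFT);
}

int main(void)
{
	/* e.g. a 5M region starting 16K past a 2M boundary */
	split_range(0x40000000UL + 0x4000,
		    0x40000000UL + 0x4000 + 5 * 1024 * 1024);
	return 0;
}

The same shape repeats one level up in populate_pud(), with PUD_SIZE boundaries and 1G mappings in the middle.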
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index dfa537a03be..6fb6927f9e7 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -25,8 +25,12 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
struct page *pte;
pte = alloc_pages(__userpte_alloc_gfp, 0);
- if (pte)
- pgtable_page_ctor(pte);
+ if (!pte)
+ return NULL;
+ if (!pgtable_page_ctor(pte)) {
+ __free_page(pte);
+ return NULL;
+ }
return pte;
}
@@ -57,6 +61,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
#if PAGETABLE_LEVELS > 2
void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
+ struct page *page = virt_to_page(pmd);
paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
/*
* NOTE! For PAE, any changes to the top page-directory-pointer-table
@@ -65,7 +70,8 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
#ifdef CONFIG_X86_PAE
tlb->need_flush_all = 1;
#endif
- tlb_remove_page(tlb, virt_to_page(pmd));
+ pgtable_pmd_page_dtor(page);
+ tlb_remove_page(tlb, page);
}
#if PAGETABLE_LEVELS > 3
@@ -189,8 +195,10 @@ static void free_pmds(pmd_t *pmds[])
int i;
for(i = 0; i < PREALLOCATED_PMDS; i++)
- if (pmds[i])
+ if (pmds[i]) {
+ pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
free_page((unsigned long)pmds[i]);
+ }
}
static int preallocate_pmds(pmd_t *pmds[])
@@ -200,8 +208,13 @@ static int preallocate_pmds(pmd_t *pmds[])
for(i = 0; i < PREALLOCATED_PMDS; i++) {
pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP);
- if (pmd == NULL)
+ if (!pmd)
+ failed = true;
+ if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) {
+ free_page((unsigned long)pmd);
+ pmd = NULL;
failed = true;
+ }
pmds[i] = pmd;
}
@@ -386,13 +399,20 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma,
int ptep_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep)
{
- int young;
-
- young = ptep_test_and_clear_young(vma, address, ptep);
- if (young)
- flush_tlb_page(vma, address);
-
- return young;
+ /*
+ * On x86 CPUs, clearing the accessed bit without a TLB flush
+ * doesn't cause data corruption. [ It could cause incorrect
+ * page aging and the (mistaken) reclaim of hot pages, but the
+ * chance of that should be relatively low. ]
+ *
+ * So as a performance optimization don't flush the TLB when
+ * clearing the accessed bit, it will eventually be flushed by
+ * a context switch or a VM operation anyway. [ In the rare
+ * event of it not getting flushed for a long time the delay
+ * shouldn't really matter because there's no real memory
+ * pressure for swapout to react to. ]
+ */
+ return ptep_test_and_clear_young(vma, address, ptep);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -436,9 +456,9 @@ void __init reserve_top_address(unsigned long reserve)
{
#ifdef CONFIG_X86_32
BUG_ON(fixmaps_set > 0);
- printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
- (int)-reserve);
- __FIXADDR_TOP = -reserve - PAGE_SIZE;
+ __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE;
+ printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n",
+ -reserve, __FIXADDR_TOP + PAGE_SIZE);
#endif
}
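The effect of the new rounding in reserve_top_address() can be checked with a quick userspace calculation. This is a minimal sketch, assuming PMD_SHIFT = 22 (non-PAE i386; it is 21 with PAE) and re-implementing round_down() locally just for the example; the chosen reservetop value is arbitrary.

#include <stdio.h>

#define PAGE_SIZE  4096u
#define PMD_SHIFT  22			/* non-PAE i386; 21 with PAE */

/* Local stand-in for the kernel's round_down() helper. */
#define round_down(x, y)  ((x) & ~((y) - 1))

int main(void)
{
	unsigned int reserve = 9u << 20;	/* e.g. reservetop=9M */
	unsigned int fixaddr_top;

	/*
	 * __FIXADDR_TOP is rounded down to a PMD boundary so the fixmap
	 * never shares a large-page mapping with the reserved region
	 * above it.
	 */
	fixaddr_top = round_down(0u - reserve, 1u << PMD_SHIFT) - PAGE_SIZE;

	printf("Reserving virtual address space above 0x%08x (rounded to 0x%08x)\n",
	       0u - reserve, fixaddr_top + PAGE_SIZE);
	return 0;
}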
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index a69bcb8c762..4dd8cf65257 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -127,7 +127,7 @@ static int __init parse_reservetop(char *arg)
address = memparse(arg, &arg);
reserve_top_address(address);
- fixup_early_ioremap();
+ early_ioremap_init();
return 0;
}
early_param("reservetop", parse_reservetop);
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 266ca912f62..66338a60aa6 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -42,15 +42,31 @@ static __init inline int srat_disabled(void)
return acpi_numa < 0;
}
-/* Callback for SLIT parsing */
+/*
+ * Callback for SLIT parsing. pxm_to_node() returns NUMA_NO_NODE for
+ * I/O localities since SRAT does not list them. I/O localities are
+ * not supported at this point.
+ */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
int i, j;
- for (i = 0; i < slit->locality_count; i++)
- for (j = 0; j < slit->locality_count; j++)
- numa_set_distance(pxm_to_node(i), pxm_to_node(j),
+ for (i = 0; i < slit->locality_count; i++) {
+ const int from_node = pxm_to_node(i);
+
+ if (from_node == NUMA_NO_NODE)
+ continue;
+
+ for (j = 0; j < slit->locality_count; j++) {
+ const int to_node = pxm_to_node(j);
+
+ if (to_node == NUMA_NO_NODE)
+ continue;
+
+ numa_set_distance(from_node, to_node,
slit->entry[slit->locality_count * i + j]);
+ }
+ }
}
/* Callback for Proximity Domain -> x2APIC mapping */
@@ -181,6 +197,11 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
(unsigned long long) start, (unsigned long long) end - 1,
hotpluggable ? " hotplug" : "");
+ /* Mark hotplug range in memblock. */
+ if (hotpluggable && memblock_mark_hotplug(start, ma->length))
+ pr_warn("SRAT: Failed to mark hotplug range [mem %#010Lx-%#010Lx] in memblock\n",
+ (unsigned long long)start, (unsigned long long)end - 1);
+
return 0;
out_err_bad_srat:
bad_srat();
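The reworked SLIT loop indexes a flat locality_count x locality_count byte matrix and now skips proximity domains that have no node mapping. A minimal userspace model follows, assuming a 3x3 SLIT and a made-up pxm_to_node() table in which PXM 1 is an (unsupported) I/O locality; numa_set_distance() is only printed, not implemented.

#include <stdio.h>

#define NUMA_NO_NODE  (-1)
#define LOCALITIES    3

/* Hypothetical PXM -> node table: PXM 1 is an I/O locality, no node. */
static const int pxm_to_node_tbl[LOCALITIES] = { 0, NUMA_NO_NODE, 1 };

/* Flat SLIT matrix, row-major: entry[count * i + j] = distance(i, j). */
static const unsigned char slit_entry[LOCALITIES * LOCALITIES] = {
	 10, 255,  20,
	255,  10, 255,
	 20, 255,  10,
};

static int pxm_to_node(int pxm)
{
	return pxm_to_node_tbl[pxm];
}

int main(void)
{
	int i, j;

	for (i = 0; i < LOCALITIES; i++) {
		const int from_node = pxm_to_node(i);

		if (from_node == NUMA_NO_NODE)
			continue;

		for (j = 0; j < LOCALITIES; j++) {
			const int to_node = pxm_to_node(j);

			if (to_node == NUMA_NO_NODE)
				continue;

			printf("numa_set_distance(%d, %d, %d)\n",
			       from_node, to_node,
			       slit_entry[LOCALITIES * i + j]);
		}
	}
	return 0;
}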
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index ae699b3bbac..dd8dda167a2 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -103,7 +103,7 @@ static void flush_tlb_func(void *info)
if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
return;
- count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
if (f->flush_end == TLB_FLUSH_ALL)
local_flush_tlb();
@@ -131,7 +131,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
info.flush_start = start;
info.flush_end = end;
- count_vm_event(NR_TLB_REMOTE_FLUSH);
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
if (is_uv_system()) {
unsigned int cpu;
@@ -151,44 +151,19 @@ void flush_tlb_current_task(void)
preempt_disable();
- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
local_flush_tlb();
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
preempt_enable();
}
-/*
- * It can find out the THP large page, or
- * HUGETLB page in tlb_flush when THP disabled
- */
-static inline unsigned long has_large_page(struct mm_struct *mm,
- unsigned long start, unsigned long end)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- unsigned long addr = ALIGN(start, HPAGE_SIZE);
- for (; addr < end; addr += HPAGE_SIZE) {
- pgd = pgd_offset(mm, addr);
- if (likely(!pgd_none(*pgd))) {
- pud = pud_offset(pgd, addr);
- if (likely(!pud_none(*pud))) {
- pmd = pmd_offset(pud, addr);
- if (likely(!pmd_none(*pmd)))
- if (pmd_large(*pmd))
- return addr;
- }
- }
- }
- return 0;
-}
-
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag)
{
unsigned long addr;
unsigned act_entries, tlb_entries = 0;
+ unsigned long nr_base_pages;
preempt_disable();
if (current->active_mm != mm)
@@ -210,21 +185,20 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
tlb_entries = tlb_lli_4k[ENTRIES];
else
tlb_entries = tlb_lld_4k[ENTRIES];
+
/* Assume all of TLB entries was occupied by this task */
- act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm;
+ act_entries = tlb_entries >> tlb_flushall_shift;
+ act_entries = mm->total_vm > act_entries ? act_entries : mm->total_vm;
+ nr_base_pages = (end - start) >> PAGE_SHIFT;
/* tlb_flushall_shift is on balance point, details in commit log */
- if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) {
- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+ if (nr_base_pages > act_entries) {
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
local_flush_tlb();
} else {
- if (has_large_page(mm, start, end)) {
- local_flush_tlb();
- goto flush_all;
- }
/* flush range by one by one 'invlpg' */
for (addr = start; addr < end; addr += PAGE_SIZE) {
- count_vm_event(NR_TLB_LOCAL_FLUSH_ONE);
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
__flush_tlb_single(addr);
}
@@ -262,7 +236,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
static void do_flush_tlb_all(void *info)
{
- count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
__flush_tlb_all();
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
leave_mm(smp_processor_id());
@@ -270,7 +244,7 @@ static void do_flush_tlb_all(void *info)
void flush_tlb_all(void)
{
- count_vm_event(NR_TLB_REMOTE_FLUSH);
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
on_each_cpu(do_flush_tlb_all, NULL, 1);
}
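The new flush decision in flush_tlb_mm_range() reduces to one comparison: flush everything once the number of 4K pages in the range exceeds the shifted TLB-entry budget, capped by the size of the mapping. A minimal userspace sketch, assuming 512 4K data-TLB entries and tlb_flushall_shift = 6 (in the kernel both are per-CPU values read at boot):

#include <stdio.h>

/* Assumed, illustrative values; the kernel reads these per CPU. */
#define TLB_ENTRIES		512	/* 4K data-TLB entries */
#define TLB_FLUSHALL_SHIFT	6

/*
 * Decide between one global flush and per-page invlpg, mirroring the
 * new heuristic: compare the number of base pages in the range against
 * the shifted TLB-entry budget, capped by total_vm.
 */
static const char *flush_strategy(unsigned long nr_base_pages,
				  unsigned long total_vm)
{
	unsigned long act_entries = TLB_ENTRIES >> TLB_FLUSHALL_SHIFT;

	if (total_vm < act_entries)
		act_entries = total_vm;

	return nr_base_pages > act_entries ? "local_flush_tlb()"
					   : "__flush_tlb_single() per page";
}

int main(void)
{
	printf("  4 pages, 1000-page mm: %s\n", flush_strategy(4, 1000));
	printf(" 64 pages, 1000-page mm: %s\n", flush_strategy(64, 1000));
	printf("  4 pages,    2-page mm: %s\n", flush_strategy(4, 2));
	return 0;
}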