Diffstat (limited to 'arch/x86/mm')
-rw-r--r--  arch/x86/mm/Makefile              |   2
-rw-r--r--  arch/x86/mm/amdtopology.c         |   3
-rw-r--r--  arch/x86/mm/dump_pagetables.c     | 118
-rw-r--r--  arch/x86/mm/extable.c             | 142
-rw-r--r--  arch/x86/mm/fault.c               | 268
-rw-r--r--  arch/x86/mm/gup.c                 |  21
-rw-r--r--  arch/x86/mm/highmem_32.c          |  11
-rw-r--r--  arch/x86/mm/hugetlbpage.c         | 320
-rw-r--r--  arch/x86/mm/init.c                | 595
-rw-r--r--  arch/x86/mm/init_32.c             | 182
-rw-r--r--  arch/x86/mm/init_64.c             | 822
-rw-r--r--  arch/x86/mm/ioremap.c             | 266
-rw-r--r--  arch/x86/mm/kmemcheck/kmemcheck.c |   8
-rw-r--r--  arch/x86/mm/kmemcheck/selftest.c  |   1
-rw-r--r--  arch/x86/mm/kmmio.c               |   1
-rw-r--r--  arch/x86/mm/memtest.c             |  12
-rw-r--r--  arch/x86/mm/mm_internal.h         |  19
-rw-r--r--  arch/x86/mm/mmap.c                |   8
-rw-r--r--  arch/x86/mm/mmio-mod.c            |   4
-rw-r--r--  arch/x86/mm/numa.c                | 156
-rw-r--r--  arch/x86/mm/numa_32.c             | 165
-rw-r--r--  arch/x86/mm/numa_64.c             |  13
-rw-r--r--  arch/x86/mm/numa_emulation.c      |  28
-rw-r--r--  arch/x86/mm/numa_internal.h       |   6
-rw-r--r--  arch/x86/mm/pageattr-test.c       |  10
-rw-r--r--  arch/x86/mm/pageattr.c            | 653
-rw-r--r--  arch/x86/mm/pat.c                 | 196
-rw-r--r--  arch/x86/mm/pat_rbtree.c          |  34
-rw-r--r--  arch/x86/mm/pgtable.c             |  78
-rw-r--r--  arch/x86/mm/pgtable_32.c          |   3
-rw-r--r--  arch/x86/mm/physaddr.c            |  60
-rw-r--r--  arch/x86/mm/setup_nx.c            |   4
-rw-r--r--  arch/x86/mm/srat.c                |  73
-rw-r--r--  arch/x86/mm/tlb.c                 | 395
34 files changed, 2866 insertions, 1811 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 23d8e5fecf7..6a19ad9f370 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -6,6 +6,8 @@ nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_physaddr.o := $(nostackp)
CFLAGS_setup_nx.o := $(nostackp)
+CFLAGS_fault.o := -I$(src)/../include/asm/trace
+
obj-$(CONFIG_X86_PAT) += pat_rbtree.o
obj-$(CONFIG_SMP) += tlb.o
diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c
index 5247d01329c..2ca15b59fb3 100644
--- a/arch/x86/mm/amdtopology.c
+++ b/arch/x86/mm/amdtopology.c
@@ -130,9 +130,8 @@ int __init amd_numa_init(void)
}
limit >>= 16;
- limit <<= 24;
- limit |= (1<<24)-1;
limit++;
+ limit <<= 24;
if (limit > end)
limit = end;
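
The reordered computation above yields the same exclusive node end address as the old shift/mask/increment sequence: after the preceding "limit >>= 16" the value is in 16 MB units, so adding one before shifting by 24 lands on the first byte past the node. A small standalone sketch (not kernel code) that checks the equivalence:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Old order: shift to a byte address, fill the low 24 bits, then add 1. */
static uint64_t node_end_old(uint64_t limit)
{
        limit <<= 24;
        limit |= (1ULL << 24) - 1;
        limit++;
        return limit;
}

/* New order: add 1 first, then shift to a byte address. */
static uint64_t node_end_new(uint64_t limit)
{
        limit++;
        limit <<= 24;
        return limit;
}

int main(void)
{
        for (uint64_t l = 0; l < 4096; l++)
                assert(node_end_old(l) == node_end_new(l));
        /* e.g. limit 5 (16 MB units) -> end 0x6000000 (96 MB) */
        printf("limit 5 -> end %#llx\n", (unsigned long long)node_end_new(5));
        return 0;
}
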
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 0002a3a3308..167ffcac16e 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -30,11 +30,14 @@ struct pg_state {
unsigned long start_address;
unsigned long current_address;
const struct addr_marker *marker;
+ unsigned long lines;
+ bool to_dmesg;
};
struct addr_marker {
unsigned long start_address;
const char *name;
+ unsigned long max_lines;
};
/* indices for address_markers; keep sync'd w/ address_markers below */
@@ -45,6 +48,7 @@ enum address_markers_idx {
LOW_KERNEL_NR,
VMALLOC_START_NR,
VMEMMAP_START_NR,
+ ESPFIX_START_NR,
HIGH_KERNEL_NR,
MODULES_VADDR_NR,
MODULES_END_NR,
@@ -67,6 +71,7 @@ static struct addr_marker address_markers[] = {
{ PAGE_OFFSET, "Low Kernel Mapping" },
{ VMALLOC_START, "vmalloc() Area" },
{ VMEMMAP_START, "Vmemmap" },
+ { ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
{ __START_KERNEL_map, "High Kernel Mapping" },
{ MODULES_VADDR, "Modules" },
{ MODULES_END, "End Modules" },
@@ -88,10 +93,28 @@ static struct addr_marker address_markers[] = {
#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
+#define pt_dump_seq_printf(m, to_dmesg, fmt, args...) \
+({ \
+ if (to_dmesg) \
+ printk(KERN_INFO fmt, ##args); \
+ else \
+ if (m) \
+ seq_printf(m, fmt, ##args); \
+})
+
+#define pt_dump_cont_printf(m, to_dmesg, fmt, args...) \
+({ \
+ if (to_dmesg) \
+ printk(KERN_CONT fmt, ##args); \
+ else \
+ if (m) \
+ seq_printf(m, fmt, ##args); \
+})
+
/*
* Print a readable form of a pgprot_t to the seq_file
*/
-static void printk_prot(struct seq_file *m, pgprot_t prot, int level)
+static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
{
pgprotval_t pr = pgprot_val(prot);
static const char * const level_name[] =
@@ -99,47 +122,47 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level)
if (!pgprot_val(prot)) {
/* Not present */
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, dmsg, " ");
} else {
if (pr & _PAGE_USER)
- seq_printf(m, "USR ");
+ pt_dump_cont_printf(m, dmsg, "USR ");
else
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, dmsg, " ");
if (pr & _PAGE_RW)
- seq_printf(m, "RW ");
+ pt_dump_cont_printf(m, dmsg, "RW ");
else
- seq_printf(m, "ro ");
+ pt_dump_cont_printf(m, dmsg, "ro ");
if (pr & _PAGE_PWT)
- seq_printf(m, "PWT ");
+ pt_dump_cont_printf(m, dmsg, "PWT ");
else
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, dmsg, " ");
if (pr & _PAGE_PCD)
- seq_printf(m, "PCD ");
+ pt_dump_cont_printf(m, dmsg, "PCD ");
else
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, dmsg, " ");
/* Bit 9 has a different meaning on level 3 vs 4 */
if (level <= 3) {
if (pr & _PAGE_PSE)
- seq_printf(m, "PSE ");
+ pt_dump_cont_printf(m, dmsg, "PSE ");
else
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, dmsg, " ");
} else {
if (pr & _PAGE_PAT)
- seq_printf(m, "pat ");
+ pt_dump_cont_printf(m, dmsg, "pat ");
else
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, dmsg, " ");
}
if (pr & _PAGE_GLOBAL)
- seq_printf(m, "GLB ");
+ pt_dump_cont_printf(m, dmsg, "GLB ");
else
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, dmsg, " ");
if (pr & _PAGE_NX)
- seq_printf(m, "NX ");
+ pt_dump_cont_printf(m, dmsg, "NX ");
else
- seq_printf(m, "x ");
+ pt_dump_cont_printf(m, dmsg, "x ");
}
- seq_printf(m, "%s\n", level_name[level]);
+ pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]);
}
/*
@@ -163,7 +186,7 @@ static void note_page(struct seq_file *m, struct pg_state *st,
pgprot_t new_prot, int level)
{
pgprotval_t prot, cur;
- static const char units[] = "KMGTPE";
+ static const char units[] = "BKMGTPE";
/*
* If we have a "break" in the series, we need to flush the state that
@@ -178,7 +201,9 @@ static void note_page(struct seq_file *m, struct pg_state *st,
st->current_prot = new_prot;
st->level = level;
st->marker = address_markers;
- seq_printf(m, "---[ %s ]---\n", st->marker->name);
+ st->lines = 0;
+ pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
+ st->marker->name);
} else if (prot != cur || level != st->level ||
st->current_address >= st->marker[1].start_address) {
const char *unit = units;
@@ -188,17 +213,24 @@ static void note_page(struct seq_file *m, struct pg_state *st,
/*
* Now print the actual finished series
*/
- seq_printf(m, "0x%0*lx-0x%0*lx ",
- width, st->start_address,
- width, st->current_address);
-
- delta = (st->current_address - st->start_address) >> 10;
- while (!(delta & 1023) && unit[1]) {
- delta >>= 10;
- unit++;
+ if (!st->marker->max_lines ||
+ st->lines < st->marker->max_lines) {
+ pt_dump_seq_printf(m, st->to_dmesg,
+ "0x%0*lx-0x%0*lx ",
+ width, st->start_address,
+ width, st->current_address);
+
+ delta = st->current_address - st->start_address;
+ while (!(delta & 1023) && unit[1]) {
+ delta >>= 10;
+ unit++;
+ }
+ pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ",
+ delta, *unit);
+ printk_prot(m, st->current_prot, st->level,
+ st->to_dmesg);
}
- seq_printf(m, "%9lu%c ", delta, *unit);
- printk_prot(m, st->current_prot, st->level);
+ st->lines++;
/*
* We print markers for special areas of address space,
@@ -206,8 +238,19 @@ static void note_page(struct seq_file *m, struct pg_state *st,
* This helps in the interpretation.
*/
if (st->current_address >= st->marker[1].start_address) {
+ if (st->marker->max_lines &&
+ st->lines > st->marker->max_lines) {
+ unsigned long nskip =
+ st->lines - st->marker->max_lines;
+ pt_dump_seq_printf(m, st->to_dmesg,
+ "... %lu entr%s skipped ... \n",
+ nskip,
+ nskip == 1 ? "y" : "ies");
+ }
st->marker++;
- seq_printf(m, "---[ %s ]---\n", st->marker->name);
+ st->lines = 0;
+ pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
+ st->marker->name);
}
st->start_address = st->current_address;
@@ -296,7 +339,7 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
#define pgd_none(a) pud_none(__pud(pgd_val(a)))
#endif
-static void walk_pgd_level(struct seq_file *m)
+void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
{
#ifdef CONFIG_X86_64
pgd_t *start = (pgd_t *) &init_level4_pgt;
@@ -304,9 +347,12 @@ static void walk_pgd_level(struct seq_file *m)
pgd_t *start = swapper_pg_dir;
#endif
int i;
- struct pg_state st;
+ struct pg_state st = {};
- memset(&st, 0, sizeof(st));
+ if (pgd) {
+ start = pgd;
+ st.to_dmesg = true;
+ }
for (i = 0; i < PTRS_PER_PGD; i++) {
st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
@@ -331,7 +377,7 @@ static void walk_pgd_level(struct seq_file *m)
static int ptdump_show(struct seq_file *m, void *v)
{
- walk_pgd_level(m);
+ ptdump_walk_pgd_level(m, NULL);
return 0;
}
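
The new lines/max_lines bookkeeping above caps how many mapping lines a single marker region may emit (16 for the ESPfix area) and prints one "... N entries skipped ..." summary when the walk crosses into the next marker. A small user-space sketch of the same clamping pattern, using hypothetical names:

#include <stdio.h>

#define MAX_LINES 16UL  /* per-region cap, as used for the ESPfix area */

/* Hypothetical stand-in for dumping one marker region of the page tables. */
static void dump_region(const char *name, unsigned long nentries)
{
        unsigned long lines = 0;

        printf("---[ %s ]---\n", name);
        for (unsigned long i = 0; i < nentries; i++) {
                if (lines < MAX_LINES)
                        printf("0x%016lx-0x%016lx ...\n", i << 12, (i + 1) << 12);
                lines++;
        }
        if (lines > MAX_LINES) {
                unsigned long nskip = lines - MAX_LINES;

                printf("... %lu entr%s skipped ...\n",
                       nskip, nskip == 1 ? "y" : "ies");
        }
}

int main(void)
{
        dump_region("ESPfix Area", 20); /* 16 lines, then "4 entries skipped" */
        return 0;
}
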
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 1fb85dbe390..903ec1e9c32 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -1,11 +1,23 @@
#include <linux/module.h>
#include <linux/spinlock.h>
+#include <linux/sort.h>
#include <asm/uaccess.h>
+static inline unsigned long
+ex_insn_addr(const struct exception_table_entry *x)
+{
+ return (unsigned long)&x->insn + x->insn;
+}
+static inline unsigned long
+ex_fixup_addr(const struct exception_table_entry *x)
+{
+ return (unsigned long)&x->fixup + x->fixup;
+}
int fixup_exception(struct pt_regs *regs)
{
const struct exception_table_entry *fixup;
+ unsigned long new_ip;
#ifdef CONFIG_PNPBIOS
if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
@@ -23,15 +35,135 @@ int fixup_exception(struct pt_regs *regs)
fixup = search_exception_tables(regs->ip);
if (fixup) {
- /* If fixup is less than 16, it means uaccess error */
- if (fixup->fixup < 16) {
+ new_ip = ex_fixup_addr(fixup);
+
+ if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
+ /* Special hack for uaccess_err */
current_thread_info()->uaccess_err = 1;
- regs->ip += fixup->fixup;
- return 1;
+ new_ip -= 0x7ffffff0;
}
- regs->ip = fixup->fixup;
+ regs->ip = new_ip;
return 1;
}
return 0;
}
+
+/* Restricted version used during very early boot */
+int __init early_fixup_exception(unsigned long *ip)
+{
+ const struct exception_table_entry *fixup;
+ unsigned long new_ip;
+
+ fixup = search_exception_tables(*ip);
+ if (fixup) {
+ new_ip = ex_fixup_addr(fixup);
+
+ if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
+ /* uaccess handling not supported during early boot */
+ return 0;
+ }
+
+ *ip = new_ip;
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Search one exception table for an entry corresponding to the
+ * given instruction address, and return the address of the entry,
+ * or NULL if none is found.
+ * We use a binary search, and thus we assume that the table is
+ * already sorted.
+ */
+const struct exception_table_entry *
+search_extable(const struct exception_table_entry *first,
+ const struct exception_table_entry *last,
+ unsigned long value)
+{
+ while (first <= last) {
+ const struct exception_table_entry *mid;
+ unsigned long addr;
+
+ mid = ((last - first) >> 1) + first;
+ addr = ex_insn_addr(mid);
+ if (addr < value)
+ first = mid + 1;
+ else if (addr > value)
+ last = mid - 1;
+ else
+ return mid;
+ }
+ return NULL;
+}
+
+/*
+ * The exception table needs to be sorted so that the binary
+ * search that we use to find entries in it works properly.
+ * This is used both for the kernel exception table and for
+ * the exception tables of modules that get loaded.
+ *
+ */
+static int cmp_ex(const void *a, const void *b)
+{
+ const struct exception_table_entry *x = a, *y = b;
+
+ /*
+ * This value will always end up fitting in an int, because on
+ * both i386 and x86-64 the kernel symbol-reachable address
+ * space is < 2 GiB.
+ *
+ * This compare is only valid after normalization.
+ */
+ return x->insn - y->insn;
+}
+
+void sort_extable(struct exception_table_entry *start,
+ struct exception_table_entry *finish)
+{
+ struct exception_table_entry *p;
+ int i;
+
+ /* Convert all entries to being relative to the start of the section */
+ i = 0;
+ for (p = start; p < finish; p++) {
+ p->insn += i;
+ i += 4;
+ p->fixup += i;
+ i += 4;
+ }
+
+ sort(start, finish - start, sizeof(struct exception_table_entry),
+ cmp_ex, NULL);
+
+ /* Denormalize all entries */
+ i = 0;
+ for (p = start; p < finish; p++) {
+ p->insn -= i;
+ i += 4;
+ p->fixup -= i;
+ i += 4;
+ }
+}
+
+#ifdef CONFIG_MODULES
+/*
+ * If the exception table is sorted, any referring to the module init
+ * will be at the beginning or the end.
+ */
+void trim_init_extable(struct module *m)
+{
+ /*trim the beginning*/
+ while (m->num_exentries &&
+ within_module_init(ex_insn_addr(&m->extable[0]), m)) {
+ m->extable++;
+ m->num_exentries--;
+ }
+ /*trim the end*/
+ while (m->num_exentries &&
+ within_module_init(ex_insn_addr(&m->extable[m->num_exentries-1]), m))
+ m->num_exentries--;
+}
+#endif /* CONFIG_MODULES */
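
ex_insn_addr()/ex_fixup_addr() above rely on the table storing 32-bit offsets relative to the address of the field itself rather than absolute pointers, which is also why sort_extable() temporarily converts the offsets to section-relative values before calling sort(). A minimal sketch of that relative-entry encoding and decoding, with hypothetical stand-in symbols:

#include <stdio.h>

struct ex_entry {
        int insn;       /* offset of the faulting insn, relative to &insn */
        int fixup;      /* offset of the fixup code, relative to &fixup */
};

/* Same decoding scheme: absolute address = field address + stored offset. */
static unsigned long ex_insn_addr(const struct ex_entry *x)
{
        return (unsigned long)&x->insn + x->insn;
}

static unsigned long ex_fixup_addr(const struct ex_entry *x)
{
        return (unsigned long)&x->fixup + x->fixup;
}

/* Stand-ins for a faulting instruction and its fixup (normally asm labels). */
static char insn_site, fixup_site;
static struct ex_entry entry;

int main(void)
{
        entry.insn  = (int)((unsigned long)&insn_site -
                            (unsigned long)&entry.insn);
        entry.fixup = (int)((unsigned long)&fixup_site -
                            (unsigned long)&entry.fixup);

        printf("insn  %p decodes to %#lx\n",
               (void *)&insn_site, ex_insn_addr(&entry));
        printf("fixup %p decodes to %#lx\n",
               (void *)&fixup_site, ex_fixup_addr(&entry));
        return 0;
}
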
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f0b4caf85c1..36642793e31 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -8,16 +8,21 @@
#include <linux/kdebug.h> /* oops_begin/end, ... */
#include <linux/module.h> /* search_exception_table */
#include <linux/bootmem.h> /* max_low_pfn */
-#include <linux/kprobes.h> /* __kprobes, ... */
+#include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */
#include <linux/mmiotrace.h> /* kmmio_handler, ... */
#include <linux/perf_event.h> /* perf_sw_event */
#include <linux/hugetlb.h> /* hstate_index_to_shift */
#include <linux/prefetch.h> /* prefetchw */
+#include <linux/context_tracking.h> /* exception_enter(), ... */
#include <asm/traps.h> /* dotraplinkage, ... */
#include <asm/pgalloc.h> /* pgd_*(), ... */
#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
-#include <asm/fixmap.h> /* VSYSCALL_START */
+#include <asm/fixmap.h> /* VSYSCALL_ADDR */
+#include <asm/vsyscall.h> /* emulate_vsyscall */
+
+#define CREATE_TRACE_POINTS
+#include <asm/trace/exceptions.h>
/*
* Page fault error code bits:
@@ -41,7 +46,7 @@ enum x86_pf_error_code {
* Returns 0 if mmiotrace is disabled, or if the fault is not
* handled by mmiotrace:
*/
-static inline int __kprobes
+static nokprobe_inline int
kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
if (unlikely(is_kmmio_active()))
@@ -50,7 +55,7 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr)
return 0;
}
-static inline int __kprobes notify_page_fault(struct pt_regs *regs)
+static nokprobe_inline int kprobes_fault(struct pt_regs *regs)
{
int ret = 0;
@@ -257,7 +262,7 @@ void vmalloc_sync_all(void)
*
* Handle a fault on the vmalloc or module mapping area
*/
-static noinline __kprobes int vmalloc_fault(unsigned long address)
+static noinline int vmalloc_fault(unsigned long address)
{
unsigned long pgd_paddr;
pmd_t *pmd_k;
@@ -287,6 +292,7 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
return 0;
}
+NOKPROBE_SYMBOL(vmalloc_fault);
/*
* Did it hit the DOS screen memory VA from vm86 mode?
@@ -354,7 +360,7 @@ void vmalloc_sync_all(void)
*
* This assumes no large pages in there.
*/
-static noinline __kprobes int vmalloc_fault(unsigned long address)
+static noinline int vmalloc_fault(unsigned long address)
{
pgd_t *pgd, *pgd_ref;
pud_t *pud, *pud_ref;
@@ -377,10 +383,12 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
if (pgd_none(*pgd_ref))
return -1;
- if (pgd_none(*pgd))
+ if (pgd_none(*pgd)) {
set_pgd(pgd, *pgd_ref);
- else
+ arch_flush_lazy_mmu_mode();
+ } else {
BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+ }
/*
* Below here mismatches are bugs because these lower tables
@@ -419,6 +427,7 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
return 0;
}
+NOKPROBE_SYMBOL(vmalloc_fault);
#ifdef CONFIG_CPU_SUP_AMD
static const char errata93_warning[] =
@@ -554,7 +563,7 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
/*
* Pentium F0 0F C7 C8 bug workaround:
*/
- if (boot_cpu_data.f00f_bug) {
+ if (boot_cpu_has_bug(X86_BUG_F00F)) {
nr = (address - idt_descr.address) >> 3;
if (nr == 6) {
@@ -578,11 +587,16 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
if (error_code & PF_INSTR) {
unsigned int level;
+ pgd_t *pgd;
+ pte_t *pte;
- pte_t *pte = lookup_address(address, &level);
+ pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
+ pgd += pgd_index(address);
+
+ pte = lookup_address_in_pgd(pgd, address, &level);
if (pte && pte_present(*pte) && !pte_exec(*pte))
- printk(nx_warning, current_uid());
+ printk(nx_warning, from_kuid(&init_user_ns, current_uid()));
}
printk(KERN_ALERT "BUG: unable to handle kernel ");
@@ -593,7 +607,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
printk(KERN_CONT " at %p\n", (void *) address);
printk(KERN_ALERT "IP:");
- printk_address(regs->ip, 1);
+ printk_address(regs->ip);
dump_pagetable(address);
}
@@ -615,7 +629,7 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code,
dump_pagetable(address);
tsk->thread.cr2 = address;
- tsk->thread.trap_no = 14;
+ tsk->thread.trap_nr = X86_TRAP_PF;
tsk->thread.error_code = error_code;
if (__die("Bad pagetable", regs, error_code))
@@ -635,14 +649,32 @@ no_context(struct pt_regs *regs, unsigned long error_code,
/* Are we prepared to handle this kernel fault? */
if (fixup_exception(regs)) {
+ /*
+ * Any interrupt that takes a fault gets the fixup. This makes
+ * the below recursive fault logic only apply to faults from
+ * task context.
+ */
+ if (in_interrupt())
+ return;
+
+ /*
+ * Per the above we're !in_interrupt(), aka. task context.
+ *
+ * In this case we need to make sure we're not recursively
+ * faulting through the emulate_vsyscall() logic.
+ */
if (current_thread_info()->sig_on_uaccess_error && signal) {
- tsk->thread.trap_no = 14;
+ tsk->thread.trap_nr = X86_TRAP_PF;
tsk->thread.error_code = error_code | PF_USER;
tsk->thread.cr2 = address;
/* XXX: hwpoison faults will set the wrong code. */
force_sig_info_fault(signal, si_code, address, tsk, 0);
}
+
+ /*
+ * Barring that, we can do the fixup and be happy.
+ */
return;
}
@@ -676,7 +708,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
tsk->thread.cr2 = address;
- tsk->thread.trap_no = 14;
+ tsk->thread.trap_nr = X86_TRAP_PF;
tsk->thread.error_code = error_code;
sig = SIGKILL;
@@ -742,19 +774,21 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
* emulation.
*/
if (unlikely((error_code & PF_INSTR) &&
- ((address & ~0xfff) == VSYSCALL_START))) {
+ ((address & ~0xfff) == VSYSCALL_ADDR))) {
if (emulate_vsyscall(regs, address))
return;
}
#endif
+ /* Kernel addresses are always protection faults: */
+ if (address >= TASK_SIZE)
+ error_code |= PF_PROT;
- if (unlikely(show_unhandled_signals))
+ if (likely(show_unhandled_signals))
show_signal_msg(regs, error_code, address, tsk);
- /* Kernel addresses are always protection faults: */
tsk->thread.cr2 = address;
- tsk->thread.error_code = error_code | (address >= TASK_SIZE);
- tsk->thread.trap_no = 14;
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = X86_TRAP_PF;
force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
@@ -802,20 +836,6 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
__bad_area(regs, error_code, address, SEGV_ACCERR);
}
-/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
-static void
-out_of_memory(struct pt_regs *regs, unsigned long error_code,
- unsigned long address)
-{
- /*
- * We ran out of memory, call the OOM killer, and return the userspace
- * (which will retry the fault, or kill us if we got oom-killed):
- */
- up_read(&current->mm->mmap_sem);
-
- pagefault_out_of_memory();
-}
-
static void
do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
unsigned int fault)
@@ -838,7 +858,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
tsk->thread.cr2 = address;
tsk->thread.error_code = error_code;
- tsk->thread.trap_no = 14;
+ tsk->thread.trap_nr = X86_TRAP_PF;
#ifdef CONFIG_MEMORY_FAILURE
if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
@@ -851,23 +871,15 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
force_sig_info_fault(SIGBUS, code, address, tsk, fault);
}
-static noinline int
+static noinline void
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
unsigned long address, unsigned int fault)
{
- /*
- * Pagefault was interrupted by SIGKILL. We have no reason to
- * continue pagefault.
- */
- if (fatal_signal_pending(current)) {
- if (!(fault & VM_FAULT_RETRY))
- up_read(&current->mm->mmap_sem);
- if (!(error_code & PF_USER))
- no_context(regs, error_code, address, 0, 0);
- return 1;
+ if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
+ up_read(&current->mm->mmap_sem);
+ no_context(regs, error_code, address, 0, 0);
+ return;
}
- if (!(fault & VM_FAULT_ERROR))
- return 0;
if (fault & VM_FAULT_OOM) {
/* Kernel mode? Handle exceptions or die: */
@@ -875,10 +887,17 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
up_read(&current->mm->mmap_sem);
no_context(regs, error_code, address,
SIGSEGV, SEGV_MAPERR);
- return 1;
+ return;
}
- out_of_memory(regs, error_code, address);
+ up_read(&current->mm->mmap_sem);
+
+ /*
+ * We ran out of memory, call the OOM killer, and return to
+ * userspace (which will retry the fault, or kill us if we got
+ * oom-killed):
+ */
+ pagefault_out_of_memory();
} else {
if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
VM_FAULT_HWPOISON_LARGE))
@@ -886,7 +905,6 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
else
BUG();
}
- return 1;
}
static int spurious_fault_check(unsigned long error_code, pte_t *pte)
@@ -912,7 +930,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
* There are no security implications to leaving a stale TLB when
* increasing the permissions on a page.
*/
-static noinline __kprobes int
+static noinline int
spurious_fault(unsigned long error_code, unsigned long address)
{
pgd_t *pgd;
@@ -943,14 +961,8 @@ spurious_fault(unsigned long error_code, unsigned long address)
if (pmd_large(*pmd))
return spurious_fault_check(error_code, (pte_t *) pmd);
- /*
- * Note: don't use pte_present() here, since it returns true
- * if the _PAGE_PROTNONE bit is set. However, this aliases the
- * _PAGE_GLOBAL bit, which for kernel pages give false positives
- * when CONFIG_DEBUG_PAGEALLOC is used.
- */
pte = pte_offset_kernel(pmd, address);
- if (!(pte_flags(*pte) & _PAGE_PRESENT))
+ if (!pte_present(*pte))
return 0;
ret = spurious_fault_check(error_code, pte);
@@ -966,6 +978,7 @@ spurious_fault(unsigned long error_code, unsigned long address)
return ret;
}
+NOKPROBE_SYMBOL(spurious_fault);
int show_unhandled_signals = 1;
@@ -995,29 +1008,45 @@ static int fault_in_kernel_space(unsigned long address)
return address >= TASK_SIZE_MAX;
}
+static inline bool smap_violation(int error_code, struct pt_regs *regs)
+{
+ if (!IS_ENABLED(CONFIG_X86_SMAP))
+ return false;
+
+ if (!static_cpu_has(X86_FEATURE_SMAP))
+ return false;
+
+ if (error_code & PF_USER)
+ return false;
+
+ if (!user_mode_vm(regs) && (regs->flags & X86_EFLAGS_AC))
+ return false;
+
+ return true;
+}
+
/*
* This routine handles page faults. It determines the address,
* and the problem, and then passes it off to one of the appropriate
* routines.
+ *
+ * This function must have noinline because both callers
+ * {,trace_}do_page_fault() have notrace on. Having this an actual function
+ * guarantees there's a function trace entry.
*/
-dotraplinkage void __kprobes
-do_page_fault(struct pt_regs *regs, unsigned long error_code)
+static noinline void
+__do_page_fault(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
- unsigned long address;
struct mm_struct *mm;
int fault;
- int write = error_code & PF_WRITE;
- unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
- (write ? FAULT_FLAG_WRITE : 0);
+ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
tsk = current;
mm = tsk->mm;
- /* Get the faulting address: */
- address = read_cr2();
-
/*
* Detect and handle instructions that would cause a page fault for
* both a tracked kernel page and a userspace page.
@@ -1056,7 +1085,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
return;
/* kprobes don't want to hook the spurious faults: */
- if (notify_page_fault(regs))
+ if (kprobes_fault(regs))
return;
/*
* Don't take the mm semaphore here. If we fixup a prefetch
@@ -1068,8 +1097,26 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
}
/* kprobes don't want to hook the spurious faults: */
- if (unlikely(notify_page_fault(regs)))
+ if (unlikely(kprobes_fault(regs)))
return;
+
+ if (unlikely(error_code & PF_RSVD))
+ pgtable_bad(regs, error_code, address);
+
+ if (unlikely(smap_violation(error_code, regs))) {
+ bad_area_nosemaphore(regs, error_code, address);
+ return;
+ }
+
+ /*
+ * If we're in an interrupt, have no user context or are running
+ * in an atomic region then we must not take the fault:
+ */
+ if (unlikely(in_atomic() || !mm)) {
+ bad_area_nosemaphore(regs, error_code, address);
+ return;
+ }
+
/*
* It's safe to allow irq's after cr2 has been saved and the
* vmalloc fault has been handled.
@@ -1080,24 +1127,16 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
if (user_mode_vm(regs)) {
local_irq_enable();
error_code |= PF_USER;
+ flags |= FAULT_FLAG_USER;
} else {
if (regs->flags & X86_EFLAGS_IF)
local_irq_enable();
}
- if (unlikely(error_code & PF_RSVD))
- pgtable_bad(regs, error_code, address);
-
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
- /*
- * If we're in an interrupt, have no user context or are running
- * in an atomic region then we must not take the fault:
- */
- if (unlikely(in_atomic() || !mm)) {
- bad_area_nosemaphore(regs, error_code, address);
- return;
- }
+ if (error_code & PF_WRITE)
+ flags |= FAULT_FLAG_WRITE;
/*
* When running in the kernel we expect faults to occur only to
@@ -1177,9 +1216,17 @@ good_area:
*/
fault = handle_mm_fault(mm, vma, address, flags);
- if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
- if (mm_fault_error(regs, error_code, address, fault))
- return;
+ /*
+ * If we need to retry but a fatal signal is pending, handle the
+ * signal first. We do not need to release the mmap_sem because it
+ * would already be released in __lock_page_or_retry in mm/filemap.c.
+ */
+ if (unlikely((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)))
+ return;
+
+ if (unlikely(fault & VM_FAULT_ERROR)) {
+ mm_fault_error(regs, error_code, address, fault);
+ return;
}
/*
@@ -1201,6 +1248,7 @@ good_area:
/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
* of starvation. */
flags &= ~FAULT_FLAG_ALLOW_RETRY;
+ flags |= FAULT_FLAG_TRIED;
goto retry;
}
}
@@ -1209,3 +1257,55 @@ good_area:
up_read(&mm->mmap_sem);
}
+NOKPROBE_SYMBOL(__do_page_fault);
+
+dotraplinkage void notrace
+do_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+ unsigned long address = read_cr2(); /* Get the faulting address */
+ enum ctx_state prev_state;
+
+ /*
+ * This function must be notrace, be excluded from kprobes, and call
+ * read_cr2() before calling anything else, to avoid invoking any kind
+ * of tracing machinery before we've observed the CR2 value.
+ *
+ * exception_{enter,exit}() contain all sorts of tracepoints.
+ */
+
+ prev_state = exception_enter();
+ __do_page_fault(regs, error_code, address);
+ exception_exit(prev_state);
+}
+NOKPROBE_SYMBOL(do_page_fault);
+
+#ifdef CONFIG_TRACING
+static nokprobe_inline void
+trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
+ unsigned long error_code)
+{
+ if (user_mode(regs))
+ trace_page_fault_user(address, regs, error_code);
+ else
+ trace_page_fault_kernel(address, regs, error_code);
+}
+
+dotraplinkage void notrace
+trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+ /*
+ * The exception_enter and tracepoint processing could
+ * trigger other page faults (user space callchain
+ * reading) and destroy the original cr2 value, so read
+ * the faulting address now.
+ */
+ unsigned long address = read_cr2();
+ enum ctx_state prev_state;
+
+ prev_state = exception_enter();
+ trace_page_fault_entries(address, regs, error_code);
+ __do_page_fault(regs, error_code, address);
+ exception_exit(prev_state);
+}
+NOKPROBE_SYMBOL(trace_do_page_fault);
+#endif /* CONFIG_TRACING */
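
The reworked retry path above allows exactly one retry round: on VM_FAULT_RETRY the handler clears FAULT_FLAG_ALLOW_RETRY and sets FAULT_FLAG_TRIED before looping, and a pending fatal signal bails out first. A compact user-space sketch of that control flow, with made-up flag values and a fake handler standing in for handle_mm_fault():

#include <stdbool.h>
#include <stdio.h>

/* Made-up flag values, for illustration only. */
#define FAULT_FLAG_ALLOW_RETRY  0x01
#define FAULT_FLAG_TRIED        0x02
#define VM_FAULT_RETRY          0x04

/* Fake handler: asks for a retry while retries are allowed, then succeeds. */
static unsigned int fake_handle_mm_fault(unsigned int flags)
{
        return (flags & FAULT_FLAG_ALLOW_RETRY) ? VM_FAULT_RETRY : 0;
}

int main(void)
{
        unsigned int flags = FAULT_FLAG_ALLOW_RETRY;
        bool fatal_signal = false;
        unsigned int fault;

retry:
        fault = fake_handle_mm_fault(flags);

        /* A fatal signal wins; the lock was already dropped by the handler. */
        if ((fault & VM_FAULT_RETRY) && fatal_signal)
                return 1;

        if (fault & VM_FAULT_RETRY) {
                /* Only one retry: drop ALLOW_RETRY and mark that we tried. */
                flags &= ~FAULT_FLAG_ALLOW_RETRY;
                flags |= FAULT_FLAG_TRIED;
                goto retry;
        }

        printf("fault handled, final flags %#x\n", flags);
        return 0;
}
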
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index dd74e46828c..207d9aef662 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -83,6 +83,12 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
pte_t pte = gup_get_pte(ptep);
struct page *page;
+ /* Similar to the PMD case, NUMA hinting must take slow path */
+ if (pte_numa(pte)) {
+ pte_unmap(ptep);
+ return 0;
+ }
+
if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
pte_unmap(ptep);
return 0;
@@ -102,8 +108,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
static inline void get_head_page_multiple(struct page *page, int nr)
{
- VM_BUG_ON(page != compound_head(page));
- VM_BUG_ON(page_count(page) == 0);
+ VM_BUG_ON_PAGE(page != compound_head(page), page);
+ VM_BUG_ON_PAGE(page_count(page) == 0, page);
atomic_add(nr, &page->_count);
SetPageReferenced(page);
}
@@ -129,7 +135,7 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
head = pte_page(pte);
page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
do {
- VM_BUG_ON(compound_head(page) != head);
+ VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
if (PageTail(page))
get_huge_page_tail(page);
@@ -167,6 +173,13 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
if (pmd_none(pmd) || pmd_trans_splitting(pmd))
return 0;
if (unlikely(pmd_large(pmd))) {
+ /*
+ * NUMA hinting faults need to be handled in the GUP
+ * slowpath for accounting purposes and so that they
+ * can be serialised against THP migration.
+ */
+ if (pmd_numa(pmd))
+ return 0;
if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
return 0;
} else {
@@ -199,7 +212,7 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
head = pte_page(pte);
page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
do {
- VM_BUG_ON(compound_head(page) != head);
+ VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
if (PageTail(page))
get_huge_page_tail(page);
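
Returning 0 from the fast path for pte_numa()/pmd_numa() entries works because callers of the lockless GUP variant are expected to fall back to the slow, lock-taking path for whatever was not pinned. A sketch of that caller-side pattern, with hypothetical fast/slow helpers rather than the real GUP API:

#include <stdio.h>

#define PAGE_SIZE 4096UL

/* Hypothetical fast path: pins what it can without locks, bails out early
 * (e.g. on a NUMA-hinting entry) and reports how far it got. */
static int fast_pin(unsigned long start, int nr_pages, void **pages)
{
        (void)start; (void)pages;
        return nr_pages < 2 ? nr_pages : 2;     /* pretend it stopped after 2 */
}

/* Hypothetical slow path: takes the mmap lock, handles the hinting fault. */
static int slow_pin(unsigned long start, int nr_pages, void **pages)
{
        (void)start; (void)pages;
        return nr_pages;
}

static int pin_user_range(unsigned long start, int nr_pages, void **pages)
{
        int nr = fast_pin(start, nr_pages, pages);

        if (nr < nr_pages)      /* fall back for whatever was skipped */
                nr += slow_pin(start + nr * PAGE_SIZE, nr_pages - nr,
                               pages + nr);
        return nr;
}

int main(void)
{
        void *pages[8];

        printf("pinned %d of 8 pages\n", pin_user_range(0x1000, 8, pages));
        return 0;
}
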
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index f4f29b19fac..4500142bc4a 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -1,6 +1,7 @@
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/swap.h> /* for totalram_pages */
+#include <linux/bootmem.h>
void *kmap(struct page *page)
{
@@ -51,11 +52,11 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
}
EXPORT_SYMBOL(kmap_atomic_prot);
-void *__kmap_atomic(struct page *page)
+void *kmap_atomic(struct page *page)
{
return kmap_atomic_prot(page, kmap_prot);
}
-EXPORT_SYMBOL(__kmap_atomic);
+EXPORT_SYMBOL(kmap_atomic);
/*
* This is the same as kmap_atomic() but can map memory that doesn't
@@ -121,6 +122,11 @@ void __init set_highmem_pages_init(void)
struct zone *zone;
int nid;
+ /*
+ * Explicitly reset zone->managed_pages because set_highmem_pages_init()
+ * is invoked before free_all_bootmem()
+ */
+ reset_all_zones_managed_pages();
for_each_zone(zone) {
unsigned long zone_start_pfn, zone_end_pfn;
@@ -137,5 +143,4 @@ void __init set_highmem_pages_init(void)
add_highpages_with_active_regions(nid, zone_start_pfn,
zone_end_pfn);
}
- totalram_pages += totalhigh_pages;
}
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 8ecbb4bba4b..8b977ebf938 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -16,159 +16,6 @@
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
-static unsigned long page_table_shareable(struct vm_area_struct *svma,
- struct vm_area_struct *vma,
- unsigned long addr, pgoff_t idx)
-{
- unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
- svma->vm_start;
- unsigned long sbase = saddr & PUD_MASK;
- unsigned long s_end = sbase + PUD_SIZE;
-
- /* Allow segments to share if only one is marked locked */
- unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
- unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
-
- /*
- * match the virtual addresses, permission and the alignment of the
- * page table page.
- */
- if (pmd_index(addr) != pmd_index(saddr) ||
- vm_flags != svm_flags ||
- sbase < svma->vm_start || svma->vm_end < s_end)
- return 0;
-
- return saddr;
-}
-
-static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
-{
- unsigned long base = addr & PUD_MASK;
- unsigned long end = base + PUD_SIZE;
-
- /*
- * check on proper vm_flags and page table alignment
- */
- if (vma->vm_flags & VM_MAYSHARE &&
- vma->vm_start <= base && end <= vma->vm_end)
- return 1;
- return 0;
-}
-
-/*
- * search for a shareable pmd page for hugetlb.
- */
-static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
-{
- struct vm_area_struct *vma = find_vma(mm, addr);
- struct address_space *mapping = vma->vm_file->f_mapping;
- pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
- vma->vm_pgoff;
- struct prio_tree_iter iter;
- struct vm_area_struct *svma;
- unsigned long saddr;
- pte_t *spte = NULL;
-
- if (!vma_shareable(vma, addr))
- return;
-
- mutex_lock(&mapping->i_mmap_mutex);
- vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
- if (svma == vma)
- continue;
-
- saddr = page_table_shareable(svma, vma, addr, idx);
- if (saddr) {
- spte = huge_pte_offset(svma->vm_mm, saddr);
- if (spte) {
- get_page(virt_to_page(spte));
- break;
- }
- }
- }
-
- if (!spte)
- goto out;
-
- spin_lock(&mm->page_table_lock);
- if (pud_none(*pud))
- pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK));
- else
- put_page(virt_to_page(spte));
- spin_unlock(&mm->page_table_lock);
-out:
- mutex_unlock(&mapping->i_mmap_mutex);
-}
-
-/*
- * unmap huge page backed by shared pte.
- *
- * Hugetlb pte page is ref counted at the time of mapping. If pte is shared
- * indicated by page_count > 1, unmap is achieved by clearing pud and
- * decrementing the ref count. If count == 1, the pte page is not shared.
- *
- * called with vma->vm_mm->page_table_lock held.
- *
- * returns: 1 successfully unmapped a shared pte page
- * 0 the underlying pte page is not shared, or it is the last user
- */
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
-{
- pgd_t *pgd = pgd_offset(mm, *addr);
- pud_t *pud = pud_offset(pgd, *addr);
-
- BUG_ON(page_count(virt_to_page(ptep)) == 0);
- if (page_count(virt_to_page(ptep)) == 1)
- return 0;
-
- pud_clear(pud);
- put_page(virt_to_page(ptep));
- *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
- return 1;
-}
-
-pte_t *huge_pte_alloc(struct mm_struct *mm,
- unsigned long addr, unsigned long sz)
-{
- pgd_t *pgd;
- pud_t *pud;
- pte_t *pte = NULL;
-
- pgd = pgd_offset(mm, addr);
- pud = pud_alloc(mm, pgd, addr);
- if (pud) {
- if (sz == PUD_SIZE) {
- pte = (pte_t *)pud;
- } else {
- BUG_ON(sz != PMD_SIZE);
- if (pud_none(*pud))
- huge_pmd_share(mm, addr, pud);
- pte = (pte_t *) pmd_alloc(mm, pud, addr);
- }
- }
- BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
-
- return pte;
-}
-
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd = NULL;
-
- pgd = pgd_offset(mm, addr);
- if (pgd_present(*pgd)) {
- pud = pud_offset(pgd, addr);
- if (pud_present(*pud)) {
- if (pud_large(*pud))
- return (pte_t *)pud;
- pmd = pmd_offset(pud, addr);
- }
- }
- return (pte_t *) pmd;
-}
-
#if 0 /* This is just for testing */
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
@@ -211,7 +58,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
{
return NULL;
}
-
#else
struct page *
@@ -229,77 +75,23 @@ int pud_huge(pud_t pud)
{
return !!(pud_val(pud) & _PAGE_PSE);
}
-
-struct page *
-follow_huge_pmd(struct mm_struct *mm, unsigned long address,
- pmd_t *pmd, int write)
-{
- struct page *page;
-
- page = pte_page(*(pte_t *)pmd);
- if (page)
- page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
- return page;
-}
-
-struct page *
-follow_huge_pud(struct mm_struct *mm, unsigned long address,
- pud_t *pud, int write)
-{
- struct page *page;
-
- page = pte_page(*(pte_t *)pud);
- if (page)
- page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
- return page;
-}
-
#endif
-/* x86_64 also uses this file */
-
-#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
+#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
struct hstate *h = hstate_file(file);
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- unsigned long start_addr;
+ struct vm_unmapped_area_info info;
- if (len > mm->cached_hole_size) {
- start_addr = mm->free_area_cache;
- } else {
- start_addr = TASK_UNMAPPED_BASE;
- mm->cached_hole_size = 0;
- }
-
-full_search:
- addr = ALIGN(start_addr, huge_page_size(h));
-
- for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
- /* At this point: (!vma || addr < vma->vm_end). */
- if (TASK_SIZE - len < addr) {
- /*
- * Start a new search - just in case we missed
- * some holes.
- */
- if (start_addr != TASK_UNMAPPED_BASE) {
- start_addr = TASK_UNMAPPED_BASE;
- mm->cached_hole_size = 0;
- goto full_search;
- }
- return -ENOMEM;
- }
- if (!vma || addr + len <= vma->vm_start) {
- mm->free_area_cache = addr + len;
- return addr;
- }
- if (addr + mm->cached_hole_size < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
- addr = ALIGN(vma->vm_end, huge_page_size(h));
- }
+ info.flags = 0;
+ info.length = len;
+ info.low_limit = current->mm->mmap_legacy_base;
+ info.high_limit = TASK_SIZE;
+ info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+ info.align_offset = 0;
+ return vm_unmapped_area(&info);
}
static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
@@ -307,89 +99,30 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
unsigned long pgoff, unsigned long flags)
{
struct hstate *h = hstate_file(file);
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma, *prev_vma;
- unsigned long base = mm->mmap_base, addr = addr0;
- unsigned long largest_hole = mm->cached_hole_size;
- int first_time = 1;
-
- /* don't allow allocations above current base */
- if (mm->free_area_cache > base)
- mm->free_area_cache = base;
-
- if (len <= largest_hole) {
- largest_hole = 0;
- mm->free_area_cache = base;
- }
-try_again:
- /* make sure it can fit in the remaining address space */
- if (mm->free_area_cache < len)
- goto fail;
+ struct vm_unmapped_area_info info;
+ unsigned long addr;
- /* either no address requested or can't fit in requested address hole */
- addr = (mm->free_area_cache - len) & huge_page_mask(h);
- do {
- /*
- * Lookup failure means no vma is above this address,
- * i.e. return with success:
- */
- vma = find_vma(mm, addr);
- if (!vma)
- return addr;
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ info.length = len;
+ info.low_limit = PAGE_SIZE;
+ info.high_limit = current->mm->mmap_base;
+ info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+ info.align_offset = 0;
+ addr = vm_unmapped_area(&info);
- /*
- * new region fits between prev_vma->vm_end and
- * vma->vm_start, use it:
- */
- prev_vma = vma->vm_prev;
- if (addr + len <= vma->vm_start &&
- (!prev_vma || (addr >= prev_vma->vm_end))) {
- /* remember the address as a hint for next time */
- mm->cached_hole_size = largest_hole;
- return (mm->free_area_cache = addr);
- } else {
- /* pull free_area_cache down to the first hole */
- if (mm->free_area_cache == vma->vm_end) {
- mm->free_area_cache = vma->vm_start;
- mm->cached_hole_size = largest_hole;
- }
- }
-
- /* remember the largest hole we saw so far */
- if (addr + largest_hole < vma->vm_start)
- largest_hole = vma->vm_start - addr;
-
- /* try just below the current vma->vm_start */
- addr = (vma->vm_start - len) & huge_page_mask(h);
- } while (len <= vma->vm_start);
-
-fail:
- /*
- * if hint left us with no space for the requested
- * mapping then try again:
- */
- if (first_time) {
- mm->free_area_cache = base;
- largest_hole = 0;
- first_time = 0;
- goto try_again;
- }
/*
* A failed mmap() very likely causes application failure,
* so fall back to the bottom-up function here. This scenario
* can happen with large stack limits and large mmap()
* allocations.
*/
- mm->free_area_cache = TASK_UNMAPPED_BASE;
- mm->cached_hole_size = ~0UL;
- addr = hugetlb_get_unmapped_area_bottomup(file, addr0,
- len, pgoff, flags);
-
- /*
- * Restore the topdown base:
- */
- mm->free_area_cache = base;
- mm->cached_hole_size = ~0UL;
+ if (addr & ~PAGE_MASK) {
+ VM_BUG_ON(addr != -ENOMEM);
+ info.flags = 0;
+ info.low_limit = TASK_UNMAPPED_BASE;
+ info.high_limit = TASK_SIZE;
+ addr = vm_unmapped_area(&info);
+ }
return addr;
}
@@ -427,8 +160,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
return hugetlb_get_unmapped_area_topdown(file, addr, len,
pgoff, flags);
}
-
-#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/
+#endif /* CONFIG_HUGETLB_PAGE */
#ifdef CONFIG_X86_64
static __init int setup_hugepagesz(char *opt)
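
Both converted helpers now describe the search to vm_unmapped_area() instead of walking VMAs by hand; the only huge-page-specific input is the alignment mask, PAGE_MASK & ~huge_page_mask(h). A standalone sketch of what that mask looks like, assuming 4 KB base pages and 2 MB huge pages:

#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)
#define PAGE_MASK       (~(PAGE_SIZE - 1))
#define HPAGE_SHIFT     21                      /* assume 2 MB huge pages */
#define HPAGE_MASK      (~((1UL << HPAGE_SHIFT) - 1))

int main(void)
{
        /* Same expression the conversion feeds into info.align_mask. */
        unsigned long align_mask = PAGE_MASK & ~HPAGE_MASK;
        unsigned long addr = 0x40251000UL;      /* page-aligned candidate */

        printf("align_mask = %#lx\n", align_mask);      /* 0x1ff000 */
        /* Clearing those bits of a page-aligned candidate gives a 2 MB
         * aligned address, which is how the search keeps hits huge-page
         * aligned. */
        printf("%#lx -> %#lx\n", addr, addr & ~align_mask);
        return 0;
}
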
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 6cabf6570d6..f9713061811 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -12,77 +12,103 @@
#include <asm/page_types.h>
#include <asm/sections.h>
#include <asm/setup.h>
-#include <asm/system.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/proto.h>
#include <asm/dma.h> /* for MAX_DMA_PFN */
+#include <asm/microcode.h>
-unsigned long __initdata pgt_buf_start;
-unsigned long __meminitdata pgt_buf_end;
-unsigned long __meminitdata pgt_buf_top;
+#include "mm_internal.h"
-int after_bootmem;
+static unsigned long __initdata pgt_buf_start;
+static unsigned long __initdata pgt_buf_end;
+static unsigned long __initdata pgt_buf_top;
-int direct_gbpages
-#ifdef CONFIG_DIRECT_GBPAGES
- = 1
-#endif
-;
+static unsigned long min_pfn_mapped;
-static void __init find_early_table_space(unsigned long end, int use_pse,
- int use_gbpages)
-{
- unsigned long puds, pmds, ptes, tables, start = 0, good_end = end;
- phys_addr_t base;
+static bool __initdata can_use_brk_pgt = true;
- puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
- tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
+/*
+ * Pages returned are already directly mapped.
+ *
+ * Changing that is likely to break Xen, see commit:
+ *
+ * 279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve
+ *
+ * for detailed information.
+ */
+__ref void *alloc_low_pages(unsigned int num)
+{
+ unsigned long pfn;
+ int i;
- if (use_gbpages) {
- unsigned long extra;
+ if (after_bootmem) {
+ unsigned int order;
- extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
- pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
- } else
- pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
+ order = get_order((unsigned long)num << PAGE_SHIFT);
+ return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK |
+ __GFP_ZERO, order);
+ }
- tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
+ if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
+ unsigned long ret;
+ if (min_pfn_mapped >= max_pfn_mapped)
+ panic("alloc_low_pages: ran out of memory");
+ ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
+ max_pfn_mapped << PAGE_SHIFT,
+ PAGE_SIZE * num , PAGE_SIZE);
+ if (!ret)
+ panic("alloc_low_pages: can not alloc memory");
+ memblock_reserve(ret, PAGE_SIZE * num);
+ pfn = ret >> PAGE_SHIFT;
+ } else {
+ pfn = pgt_buf_end;
+ pgt_buf_end += num;
+ printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\n",
+ pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1);
+ }
- if (use_pse) {
- unsigned long extra;
+ for (i = 0; i < num; i++) {
+ void *adr;
- extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
-#ifdef CONFIG_X86_32
- extra += PMD_SIZE;
-#endif
- ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
- } else
- ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ adr = __va((pfn + i) << PAGE_SHIFT);
+ clear_page(adr);
+ }
- tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
+ return __va(pfn << PAGE_SHIFT);
+}
-#ifdef CONFIG_X86_32
- /* for fixmap */
- tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
-#endif
- good_end = max_pfn_mapped << PAGE_SHIFT;
+/* need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS */
+#define INIT_PGT_BUF_SIZE (6 * PAGE_SIZE)
+RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
+void __init early_alloc_pgt_buf(void)
+{
+ unsigned long tables = INIT_PGT_BUF_SIZE;
+ phys_addr_t base;
- base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
- if (!base)
- panic("Cannot find space for the kernel page tables");
+ base = __pa(extend_brk(tables, PAGE_SIZE));
pgt_buf_start = base >> PAGE_SHIFT;
pgt_buf_end = pgt_buf_start;
pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
-
- printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
- end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT);
}
-void __init native_pagetable_reserve(u64 start, u64 end)
+int after_bootmem;
+
+int direct_gbpages
+#ifdef CONFIG_DIRECT_GBPAGES
+ = 1
+#endif
+;
+
+static void __init init_gbpages(void)
{
- memblock_reserve(start, end - start);
+#ifdef CONFIG_X86_64
+ if (direct_gbpages && cpu_has_gbpages)
+ printk(KERN_INFO "Using GB pages for direct mapping\n");
+ else
+ direct_gbpages = 0;
+#endif
}
struct map_range {
@@ -91,6 +117,35 @@ struct map_range {
unsigned page_size_mask;
};
+static int page_size_mask;
+
+static void __init probe_page_size_mask(void)
+{
+ init_gbpages();
+
+#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK)
+ /*
+ * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+ * This will simplify cpa(), which otherwise needs to support splitting
+ * large pages into small in interrupt context, etc.
+ */
+ if (direct_gbpages)
+ page_size_mask |= 1 << PG_LEVEL_1G;
+ if (cpu_has_pse)
+ page_size_mask |= 1 << PG_LEVEL_2M;
+#endif
+
+ /* Enable PSE if available */
+ if (cpu_has_pse)
+ set_in_cr4(X86_CR4_PSE);
+
+ /* Enable PGE if available */
+ if (cpu_has_pge) {
+ set_in_cr4(X86_CR4_PGE);
+ __supported_pte_mask |= _PAGE_GLOBAL;
+ }
+}
+
#ifdef CONFIG_X86_32
#define NR_RANGE_MR 3
#else /* CONFIG_X86_64 */
@@ -114,57 +169,51 @@ static int __meminit save_mr(struct map_range *mr, int nr_range,
}
/*
- * Setup the direct mapping of the physical memory at PAGE_OFFSET.
- * This runs before bootmem is initialized and gets pages directly from
- * the physical memory. To access them they are temporarily mapped.
+ * Adjust the page_size_mask for a small range to use a big page
+ * size instead of a small one if the nearby ranges are RAM too.
*/
-unsigned long __init_refok init_memory_mapping(unsigned long start,
- unsigned long end)
+static void __init_refok adjust_range_page_size_mask(struct map_range *mr,
+ int nr_range)
{
- unsigned long page_size_mask = 0;
- unsigned long start_pfn, end_pfn;
- unsigned long ret = 0;
- unsigned long pos;
-
- struct map_range mr[NR_RANGE_MR];
- int nr_range, i;
- int use_pse, use_gbpages;
+ int i;
- printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
+ for (i = 0; i < nr_range; i++) {
+ if ((page_size_mask & (1<<PG_LEVEL_2M)) &&
+ !(mr[i].page_size_mask & (1<<PG_LEVEL_2M))) {
+ unsigned long start = round_down(mr[i].start, PMD_SIZE);
+ unsigned long end = round_up(mr[i].end, PMD_SIZE);
-#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
- /*
- * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
- * This will simplify cpa(), which otherwise needs to support splitting
- * large pages into small in interrupt context, etc.
- */
- use_pse = use_gbpages = 0;
-#else
- use_pse = cpu_has_pse;
- use_gbpages = direct_gbpages;
+#ifdef CONFIG_X86_32
+ if ((end >> PAGE_SHIFT) > max_low_pfn)
+ continue;
#endif
- /* Enable PSE if available */
- if (cpu_has_pse)
- set_in_cr4(X86_CR4_PSE);
-
- /* Enable PGE if available */
- if (cpu_has_pge) {
- set_in_cr4(X86_CR4_PGE);
- __supported_pte_mask |= _PAGE_GLOBAL;
+ if (memblock_is_region_memory(start, end - start))
+ mr[i].page_size_mask |= 1<<PG_LEVEL_2M;
+ }
+ if ((page_size_mask & (1<<PG_LEVEL_1G)) &&
+ !(mr[i].page_size_mask & (1<<PG_LEVEL_1G))) {
+ unsigned long start = round_down(mr[i].start, PUD_SIZE);
+ unsigned long end = round_up(mr[i].end, PUD_SIZE);
+
+ if (memblock_is_region_memory(start, end - start))
+ mr[i].page_size_mask |= 1<<PG_LEVEL_1G;
+ }
}
+}
- if (use_gbpages)
- page_size_mask |= 1 << PG_LEVEL_1G;
- if (use_pse)
- page_size_mask |= 1 << PG_LEVEL_2M;
+static int __meminit split_mem_range(struct map_range *mr, int nr_range,
+ unsigned long start,
+ unsigned long end)
+{
+ unsigned long start_pfn, end_pfn, limit_pfn;
+ unsigned long pfn;
+ int i;
- memset(mr, 0, sizeof(mr));
- nr_range = 0;
+ limit_pfn = PFN_DOWN(end);
/* head if not big page alignment ? */
- start_pfn = start >> PAGE_SHIFT;
- pos = start_pfn << PAGE_SHIFT;
+ pfn = start_pfn = PFN_DOWN(start);
#ifdef CONFIG_X86_32
/*
* Don't use a large page for the first 2/4MB of memory
@@ -172,68 +221,65 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
* and overlapping MTRRs into large pages can cause
* slowdowns.
*/
- if (pos == 0)
- end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
+ if (pfn == 0)
+ end_pfn = PFN_DOWN(PMD_SIZE);
else
- end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
+ end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#else /* CONFIG_X86_64 */
- end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
+ end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#endif
- if (end_pfn > (end >> PAGE_SHIFT))
- end_pfn = end >> PAGE_SHIFT;
+ if (end_pfn > limit_pfn)
+ end_pfn = limit_pfn;
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
- pos = end_pfn << PAGE_SHIFT;
+ pfn = end_pfn;
}
/* big page (2M) range */
- start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
+ start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#ifdef CONFIG_X86_32
- end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+ end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
#else /* CONFIG_X86_64 */
- end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
- << (PUD_SHIFT - PAGE_SHIFT);
- if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
- end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
+ end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
+ if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE)))
+ end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
#endif
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask & (1<<PG_LEVEL_2M));
- pos = end_pfn << PAGE_SHIFT;
+ pfn = end_pfn;
}
#ifdef CONFIG_X86_64
/* big page (1G) range */
- start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
- << (PUD_SHIFT - PAGE_SHIFT);
- end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
+ start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
+ end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE));
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask &
((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
- pos = end_pfn << PAGE_SHIFT;
+ pfn = end_pfn;
}
/* tail is not big page (1G) alignment */
- start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
- end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+ start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
+ end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask & (1<<PG_LEVEL_2M));
- pos = end_pfn << PAGE_SHIFT;
+ pfn = end_pfn;
}
#endif
/* tail is not big page (2M) alignment */
- start_pfn = pos>>PAGE_SHIFT;
- end_pfn = end>>PAGE_SHIFT;
+ start_pfn = pfn;
+ end_pfn = limit_pfn;
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+ if (!after_bootmem)
+ adjust_range_page_size_mask(mr, nr_range);
+
/* try to merge same page size and continuous */
for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
unsigned long old_start;
@@ -249,58 +295,279 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
}
for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG " %010lx - %010lx page %s\n",
- mr[i].start, mr[i].end,
+ printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n",
+ mr[i].start, mr[i].end - 1,
(mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
(mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
- /*
- * Find space for the kernel direct mapping tables.
- *
- * Later we should allocate these tables in the local node of the
- * memory mapped. Unfortunately this is done currently before the
- * nodes are discovered.
- */
- if (!after_bootmem)
- find_early_table_space(end, use_pse, use_gbpages);
+ return nr_range;
+}
+
+struct range pfn_mapped[E820_X_MAX];
+int nr_pfn_mapped;
+
+static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn)
+{
+ nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_X_MAX,
+ nr_pfn_mapped, start_pfn, end_pfn);
+ nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_X_MAX);
+
+ max_pfn_mapped = max(max_pfn_mapped, end_pfn);
+
+ if (start_pfn < (1UL<<(32-PAGE_SHIFT)))
+ max_low_pfn_mapped = max(max_low_pfn_mapped,
+ min(end_pfn, 1UL<<(32-PAGE_SHIFT)));
+}
+
+bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn)
+{
+ int i;
+
+ for (i = 0; i < nr_pfn_mapped; i++)
+ if ((start_pfn >= pfn_mapped[i].start) &&
+ (end_pfn <= pfn_mapped[i].end))
+ return true;
+
+ return false;
+}
+
+/*
+ * Setup the direct mapping of the physical memory at PAGE_OFFSET.
+ * This runs before bootmem is initialized and gets pages directly from
+ * the physical memory. To access them they are temporarily mapped.
+ */
+unsigned long __init_refok init_memory_mapping(unsigned long start,
+ unsigned long end)
+{
+ struct map_range mr[NR_RANGE_MR];
+ unsigned long ret = 0;
+ int nr_range, i;
+
+ pr_info("init_memory_mapping: [mem %#010lx-%#010lx]\n",
+ start, end - 1);
+
+ memset(mr, 0, sizeof(mr));
+ nr_range = split_mem_range(mr, 0, start, end);
for (i = 0; i < nr_range; i++)
ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
mr[i].page_size_mask);
-#ifdef CONFIG_X86_32
- early_ioremap_page_table_range_init();
+ add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);
- load_cr3(swapper_pg_dir);
-#endif
+ return ret >> PAGE_SHIFT;
+}
- __flush_tlb_all();
+/*
+ * We need to iterate through the E820 memory map and create direct mappings
+ * for only E820_RAM and E820_KERN_RESERVED regions. We cannot simply
+ * create direct mappings for all pfns from [0 to max_low_pfn) and
+ * [4GB to max_pfn) because of possible memory holes in high addresses
+ * that cannot be marked as UC by fixed/variable range MTRRs.
+ * Depending on the alignment of E820 ranges, this may possibly result
+ * in using smaller size (i.e. 4K instead of 2M or 1G) page tables.
+ *
+ * init_mem_mapping() calls init_range_memory_mapping() with a big range.
+ * That range may have holes in the middle or at the ends, and only the
+ * RAM parts will be mapped by init_range_memory_mapping().
+ */
+static unsigned long __init init_range_memory_mapping(
+ unsigned long r_start,
+ unsigned long r_end)
+{
+ unsigned long start_pfn, end_pfn;
+ unsigned long mapped_ram_size = 0;
+ int i;
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
+ u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end);
+ u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end);
+ if (start >= end)
+ continue;
+
+ /*
+ * if it is overlapping with brk pgt, we need to
+ * alloc pgt buf from memblock instead.
+ */
+ can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >=
+ min(end, (u64)pgt_buf_top<<PAGE_SHIFT);
+ init_memory_mapping(start, end);
+ mapped_ram_size += end - start;
+ can_use_brk_pgt = true;
+ }
+
+ return mapped_ram_size;
+}
+
+static unsigned long __init get_new_step_size(unsigned long step_size)
+{
/*
- * Reserve the kernel pagetable pages we used (pgt_buf_start -
- * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top)
- * so that they can be reused for other purposes.
+ * Why we shift by 5, and why we don't have to worry about
+ * 'step_size << 5' overflowing:
*
- * On native it just means calling memblock_reserve, on Xen it also
- * means marking RW the pagetable pages that we allocated before
- * but that haven't been used.
+ * initial mapped size is PMD_SIZE (2M).
+ * We can not set step_size to be PUD_SIZE (1G) yet.
+ * We cannot set step_size to be PUD_SIZE (1G) yet.
+ * In the worst case, when we cross the 1G boundary, and
+ * to map 1G range with PTE. Use 5 as shift for now.
*
- * In fact on xen we mark RO the whole range pgt_buf_start -
- * pgt_buf_top, because we have to make sure that when
- * init_memory_mapping reaches the pagetable pages area, it maps
- * RO all the pagetable pages, including the ones that are beyond
- * pgt_buf_end at that time.
+	 * We don't need to worry about overflow: on 32bit, when step_size
+ * is 0, round_down() returns 0 for start, and that turns it
+ * into 0x100000000ULL.
*/
- if (!after_bootmem && pgt_buf_end > pgt_buf_start)
- x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
- PFN_PHYS(pgt_buf_end));
+ return step_size << 5;
+}
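
To make the shift concrete: page tables for each window are allocated out of memory mapped in earlier windows, so the window may only grow once a previous pass has mapped enough RAM, and growing by 32x per pass keeps that budget comfortable even in the PTE-only worst case described above. A quick arithmetic sketch of the progression, assuming PMD_SIZE is 2 MiB:

    #include <stdio.h>

    int main(void)
    {
        unsigned long long step = 2ULL << 20;   /* start at PMD_SIZE = 2 MiB */

        for (int pass = 0; pass < 4; pass++) {
            printf("pass %d: step_size = %llu MiB\n", pass, step >> 20);
            step <<= 5;                         /* 2M -> 64M -> 2G -> 64G */
        }
        /* Mapping 1 GiB purely with 4 KiB PTEs needs ~512 PTE pages plus a
         * PMD and a PUD page (~2 MiB + 8 KiB) -- hence the cautious 2 MiB
         * start and the x32 growth rather than jumping straight to 1 GiB. */
        return 0;
    }
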
- if (!after_bootmem)
- early_memtest(start, end);
+/**
+ * memory_map_top_down - Map [map_start, map_end) top down
+ * @map_start: start address of the target memory range
+ * @map_end: end address of the target memory range
+ *
+ * This function will set up the direct mapping for the memory range
+ * [map_start, map_end) top-down. That is, the page tables will be
+ * allocated at the end of memory, and we map the memory top-down.
+ */
+static void __init memory_map_top_down(unsigned long map_start,
+ unsigned long map_end)
+{
+ unsigned long real_end, start, last_start;
+ unsigned long step_size;
+ unsigned long addr;
+ unsigned long mapped_ram_size = 0;
+ unsigned long new_mapped_ram_size;
- return ret >> PAGE_SHIFT;
+	/* Xen has a big reserved range near the end of RAM; skip it at first. */
+ addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE);
+ real_end = addr + PMD_SIZE;
+
+	/* step_size needs to be small so the pgt_buf from BRK can cover it */
+ step_size = PMD_SIZE;
+ max_pfn_mapped = 0; /* will get exact value next */
+ min_pfn_mapped = real_end >> PAGE_SHIFT;
+ last_start = start = real_end;
+
+ /*
+ * We start from the top (end of memory) and go to the bottom.
+	 * memblock_find_in_range() gets us a block of RAM from the end of
+	 * RAM in [min_pfn_mapped, max_pfn_mapped) to use for new page tables.
+ */
+ while (last_start > map_start) {
+ if (last_start > step_size) {
+ start = round_down(last_start - 1, step_size);
+ if (start < map_start)
+ start = map_start;
+ } else
+ start = map_start;
+ new_mapped_ram_size = init_range_memory_mapping(start,
+ last_start);
+ last_start = start;
+ min_pfn_mapped = last_start >> PAGE_SHIFT;
+		/* only increase step_size after a big range gets mapped */
+ if (new_mapped_ram_size > mapped_ram_size)
+ step_size = get_new_step_size(step_size);
+ mapped_ram_size += new_mapped_ram_size;
+ }
+
+ if (real_end < map_end)
+ init_range_memory_mapping(real_end, map_end);
}
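
A compact userspace simulation of the loop above may help when reasoning about which windows get mapped in which order: it walks from real_end down to map_start in step-aligned windows that grow by 32x each pass. The kernel only grows the step after a big enough range was actually mapped; that check is dropped here, and all addresses are made up:

    #include <stdio.h>

    int main(void)
    {
        unsigned long long map_start = 1ULL << 20;      /* 1 MiB */
        unsigned long long real_end  = 1ULL << 32;      /* 4 GiB */
        unsigned long long step      = 2ULL << 20;      /* 2 MiB */
        unsigned long long last = real_end, start;

        while (last > map_start) {
            if (last > step) {
                start = (last - 1) & ~(step - 1);   /* round_down(last - 1, step) */
                if (start < map_start)
                    start = map_start;
            } else {
                start = map_start;
            }
            printf("map [%#llx, %#llx)\n", start, last);
            last = start;
            step <<= 5;             /* simplified: grow unconditionally */
        }
        return 0;
    }
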
+/**
+ * memory_map_bottom_up - Map [map_start, map_end) bottom up
+ * @map_start: start address of the target memory range
+ * @map_end: end address of the target memory range
+ *
+ * This function will set up the direct mapping for the memory range
+ * [map_start, map_end) bottom-up. Since we have limited bottom-up
+ * allocation to above the kernel, the page tables will be allocated
+ * just above the kernel and we map the memory in [map_start, map_end)
+ * bottom-up.
+ */
+static void __init memory_map_bottom_up(unsigned long map_start,
+ unsigned long map_end)
+{
+ unsigned long next, new_mapped_ram_size, start;
+ unsigned long mapped_ram_size = 0;
+	/* step_size needs to be small so the pgt_buf from BRK can cover it */
+ unsigned long step_size = PMD_SIZE;
+
+ start = map_start;
+ min_pfn_mapped = start >> PAGE_SHIFT;
+
+ /*
+ * We start from the bottom (@map_start) and go to the top (@map_end).
+	 * memblock_find_in_range() gets us a block of RAM from the end of
+	 * RAM in [min_pfn_mapped, max_pfn_mapped) to use for new page tables.
+ */
+ while (start < map_end) {
+ if (map_end - start > step_size) {
+ next = round_up(start + 1, step_size);
+ if (next > map_end)
+ next = map_end;
+ } else
+ next = map_end;
+
+ new_mapped_ram_size = init_range_memory_mapping(start, next);
+ start = next;
+
+ if (new_mapped_ram_size > mapped_ram_size)
+ step_size = get_new_step_size(step_size);
+ mapped_ram_size += new_mapped_ram_size;
+ }
+}
+
+void __init init_mem_mapping(void)
+{
+ unsigned long end;
+
+ probe_page_size_mask();
+
+#ifdef CONFIG_X86_64
+ end = max_pfn << PAGE_SHIFT;
+#else
+ end = max_low_pfn << PAGE_SHIFT;
+#endif
+
+ /* the ISA range is always mapped regardless of memory holes */
+ init_memory_mapping(0, ISA_END_ADDRESS);
+
+ /*
+ * If the allocation is in bottom-up direction, we setup direct mapping
+ * in bottom-up, otherwise we setup direct mapping in top-down.
+ */
+ if (memblock_bottom_up()) {
+ unsigned long kernel_end = __pa_symbol(_end);
+
+ /*
+		 * We need two separate calls here. This is because we want to
+		 * allocate page tables above the kernel. So we first map
+		 * [kernel_end, end) so that memory above the kernel is mapped
+		 * as soon as possible, and then use page tables allocated above
+		 * the kernel to map [ISA_END_ADDRESS, kernel_end).
+ */
+ memory_map_bottom_up(kernel_end, end);
+ memory_map_bottom_up(ISA_END_ADDRESS, kernel_end);
+ } else {
+ memory_map_top_down(ISA_END_ADDRESS, end);
+ }
+
+#ifdef CONFIG_X86_64
+ if (max_pfn > max_low_pfn) {
+		/* can we preserve max_low_pfn? */
+ max_low_pfn = max_pfn;
+ }
+#else
+ early_ioremap_page_table_range_init();
+#endif
+
+ load_cr3(swapper_pg_dir);
+ __flush_tlb_all();
+
+ early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
+}
/*
* devmem_is_allowed() checks to see if /dev/mem access to a certain address
@@ -314,7 +581,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
*/
int devmem_is_allowed(unsigned long pagenr)
{
- if (pagenr <= 256)
+ if (pagenr < 256)
return 1;
if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
return 0;
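
The comparison change is just an off-by-one fix in pfn arithmetic: with 4 KiB pages, pfn 256 is the first page at or above 1 MiB, so only pfns 0..255 belong to the always-allowed legacy low megabyte. A one-line check of that arithmetic:

    #include <stdio.h>

    int main(void)
    {
        printf("%#lx\n", 256UL << 12);  /* 0x100000: pfn 256 starts at 1 MiB */
        return 0;
    }
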
@@ -325,7 +592,6 @@ int devmem_is_allowed(unsigned long pagenr)
void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
- unsigned long addr;
unsigned long begin_aligned, end_aligned;
/* Make sure boundaries are page aligned */
@@ -340,16 +606,14 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
if (begin >= end)
return;
- addr = begin;
-
/*
* If debugging page accesses then do not free this memory but
* mark them not present - any buggy init-section access will
* create a kernel page fault:
*/
#ifdef CONFIG_DEBUG_PAGEALLOC
- printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
- begin, end);
+ printk(KERN_INFO "debug: unmapping init [mem %#010lx-%#010lx]\n",
+ begin, end - 1);
set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
#else
/*
@@ -360,28 +624,29 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
- printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
-
- for (; addr < end; addr += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(addr));
- init_page_count(virt_to_page(addr));
- memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
- free_page(addr);
- totalram_pages++;
- }
+ free_reserved_area((void *)begin, (void *)end, POISON_FREE_INITMEM, what);
#endif
}
void free_initmem(void)
{
- free_init_pages("unused kernel memory",
+ free_init_pages("unused kernel",
(unsigned long)(&__init_begin),
(unsigned long)(&__init_end));
}
#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
+void __init free_initrd_mem(unsigned long start, unsigned long end)
{
+#ifdef CONFIG_MICROCODE_EARLY
+ /*
+ * Remember, initrd memory may contain microcode or other useful things.
+	 * Before we lose the initrd memory, we need to find a place to
+	 * hold those contents now that normal virtual memory is enabled.
+ */
+ save_microcode_in_initrd();
+#endif
+
/*
 * end may not be aligned, and we cannot align it there;
 * the decompressor could be confused by an aligned initrd_end
@@ -391,7 +656,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
* - relocate_initrd()
 * So here we can safely do PAGE_ALIGN() to get the partial page freed
*/
- free_init_pages("initrd memory", start, PAGE_ALIGN(end));
+ free_init_pages("initrd", start, PAGE_ALIGN(end));
}
#endif
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 8663f6c47cc..e39504878ae 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -35,7 +35,6 @@
#include <asm/asm.h>
#include <asm/bios_ebda.h>
#include <asm/processor.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/dma.h>
@@ -54,25 +53,14 @@
#include <asm/page_types.h>
#include <asm/init.h>
+#include "mm_internal.h"
+
unsigned long highstart_pfn, highend_pfn;
static noinline int do_test_wp_bit(void);
bool __read_mostly __vmalloc_start_set = false;
-static __init void *alloc_low_page(void)
-{
- unsigned long pfn = pgt_buf_end++;
- void *adr;
-
- if (pfn >= pgt_buf_top)
- panic("alloc_low_page: ran out of memory");
-
- adr = __va(pfn * PAGE_SIZE);
- clear_page(adr);
- return adr;
-}
-
/*
* Creates a middle page table and puts a pointer to it in the
* given global directory entry. This only returns the gd entry
@@ -85,10 +73,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
#ifdef CONFIG_X86_PAE
if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
- if (after_bootmem)
- pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE);
- else
- pmd_table = (pmd_t *)alloc_low_page();
+ pmd_table = (pmd_t *)alloc_low_page();
paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
pud = pud_offset(pgd, 0);
@@ -110,17 +95,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
static pte_t * __init one_page_table_init(pmd_t *pmd)
{
if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
- pte_t *page_table = NULL;
-
- if (after_bootmem) {
-#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
- page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
-#endif
- if (!page_table)
- page_table =
- (pte_t *)alloc_bootmem_pages(PAGE_SIZE);
- } else
- page_table = (pte_t *)alloc_low_page();
+ pte_t *page_table = (pte_t *)alloc_low_page();
paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
@@ -147,8 +122,39 @@ pte_t * __init populate_extra_pte(unsigned long vaddr)
return one_page_table_init(pmd) + pte_idx;
}
+static unsigned long __init
+page_table_range_init_count(unsigned long start, unsigned long end)
+{
+ unsigned long count = 0;
+#ifdef CONFIG_HIGHMEM
+ int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
+ int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
+ int pgd_idx, pmd_idx;
+ unsigned long vaddr;
+
+ if (pmd_idx_kmap_begin == pmd_idx_kmap_end)
+ return 0;
+
+ vaddr = start;
+	pgd_idx = pgd_index(vaddr);
+	pmd_idx = pmd_index(vaddr);
+
+ for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) {
+ for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
+ pmd_idx++) {
+ if ((vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin &&
+ (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end)
+ count++;
+ vaddr += PMD_SIZE;
+ }
+ pmd_idx = 0;
+ }
+#endif
+ return count;
+}
+
static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
- unsigned long vaddr, pte_t *lastpte)
+ unsigned long vaddr, pte_t *lastpte,
+ void **adr)
{
#ifdef CONFIG_HIGHMEM
/*
@@ -162,16 +168,15 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
if (pmd_idx_kmap_begin != pmd_idx_kmap_end
&& (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
- && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
- && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start
- || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) {
+ && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) {
pte_t *newpte;
int i;
BUG_ON(after_bootmem);
- newpte = alloc_low_page();
+ newpte = *adr;
for (i = 0; i < PTRS_PER_PTE; i++)
set_pte(newpte + i, pte[i]);
+ *adr = (void *)(((unsigned long)(*adr)) + PAGE_SIZE);
paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT);
set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE));
@@ -205,6 +210,11 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
pgd_t *pgd;
pmd_t *pmd;
pte_t *pte = NULL;
+ unsigned long count = page_table_range_init_count(start, end);
+ void *adr = NULL;
+
+ if (count)
+ adr = alloc_low_pages(count);
vaddr = start;
pgd_idx = pgd_index(vaddr);
@@ -217,7 +227,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
pmd++, pmd_idx++) {
pte = page_table_kmap_check(one_page_table_init(pmd),
- pmd, vaddr, pte);
+ pmd, vaddr, pte, &adr);
vaddr += PMD_SIZE;
}
@@ -311,6 +321,7 @@ repeat:
__pgprot(PTE_IDENT_ATTR |
_PAGE_PSE);
+ pfn &= PMD_MASK >> PAGE_SHIFT;
addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
PAGE_OFFSET + PAGE_SIZE-1;
@@ -416,14 +427,6 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
pkmap_page_table = pte;
}
-static void __init add_one_highpage_init(struct page *page)
-{
- ClearPageReserved(page);
- init_page_count(page);
- __free_page(page);
- totalhigh_pages++;
-}
-
void __init add_highpages_with_active_regions(int nid,
unsigned long start_pfn, unsigned long end_pfn)
{
@@ -437,7 +440,7 @@ void __init add_highpages_with_active_regions(int nid,
start_pfn, end_pfn);
for ( ; pfn < e_pfn; pfn++)
if (pfn_valid(pfn))
- add_one_highpage_init(pfn_to_page(pfn));
+ free_highmem_page(pfn_to_page(pfn));
}
}
#else
@@ -446,19 +449,24 @@ static inline void permanent_kmaps_init(pgd_t *pgd_base)
}
#endif /* CONFIG_HIGHMEM */
-void __init native_pagetable_setup_start(pgd_t *base)
+void __init native_pagetable_init(void)
{
unsigned long pfn, va;
- pgd_t *pgd;
+ pgd_t *pgd, *base = swapper_pg_dir;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
/*
* Remove any mappings which extend past the end of physical
- * memory from the boot time page table:
+ * memory from the boot time page table.
+	 * In the virtual address space, we should have at least two pages
+	 * between VMALLOC_END and pkmap or fixmap, per the VMALLOC_END
+	 * definition, and max_low_pfn is set from VMALLOC_END's physical
+	 * address. If the initial memory mapping did its job right, we should
+	 * find a pte in use near max_low_pfn, or a pmd that is not present.
*/
- for (pfn = max_low_pfn + 1; pfn < 1<<(32-PAGE_SHIFT); pfn++) {
+ for (pfn = max_low_pfn; pfn < 1<<(32-PAGE_SHIFT); pfn++) {
va = PAGE_OFFSET + (pfn<<PAGE_SHIFT);
pgd = base + pgd_index(va);
if (!pgd_present(*pgd))
@@ -469,17 +477,23 @@ void __init native_pagetable_setup_start(pgd_t *base)
if (!pmd_present(*pmd))
break;
+		/* this should not be a large page */
+ if (pmd_large(*pmd)) {
+			pr_warn("trying to clear pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx, but the pmd is a large page and does not use a pte!\n",
+ pfn, pmd, __pa(pmd));
+ BUG_ON(1);
+ }
+
pte = pte_offset_kernel(pmd, va);
if (!pte_present(*pte))
break;
+ printk(KERN_DEBUG "clearing pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx pte: %p pte phys: %lx\n",
+ pfn, pmd, __pa(pmd), pte, __pa(pte));
pte_clear(NULL, va, pte);
}
paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT);
-}
-
-void __init native_pagetable_setup_done(pgd_t *base)
-{
+ paging_init();
}
/*
@@ -494,7 +508,7 @@ void __init native_pagetable_setup_done(pgd_t *base)
* If we're booting paravirtualized under a hypervisor, then there are
* more options: we may already be running PAE, and the pagetable may
* or may not be based in swapper_pg_dir. In any case,
- * paravirt_pagetable_setup_start() will set up swapper_pg_dir
+ * paravirt_pagetable_init() will set up swapper_pg_dir
* appropriately for the rest of the initialization to work.
*
* In general, pagetable_init() assumes that the pagetable may already
@@ -554,7 +568,7 @@ early_param("highmem", parse_highmem);
* artificially via the highmem=x boot parameter then create
* it:
*/
-void __init lowmem_pfn_init(void)
+static void __init lowmem_pfn_init(void)
{
/* max_low_pfn is 0, we already have early_res support */
max_low_pfn = max_pfn;
@@ -590,7 +604,7 @@ void __init lowmem_pfn_init(void)
* We have more RAM than fits into lowmem - we try to put it into
* highmem, also taking the highmem=x boot parameter into account:
*/
-void __init highmem_pfn_init(void)
+static void __init highmem_pfn_init(void)
{
max_low_pfn = MAXMEM_PFN;
@@ -646,18 +660,16 @@ void __init initmem_init(void)
highstart_pfn = max_low_pfn;
printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
pages_to_mb(highend_pfn - highstart_pfn));
- num_physpages = highend_pfn;
high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
#else
- num_physpages = max_low_pfn;
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
#endif
- memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
+ memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
sparse_memory_present_with_active_regions(0);
#ifdef CONFIG_FLATMEM
- max_mapnr = num_physpages;
+ max_mapnr = IS_ENABLED(CONFIG_HIGHMEM) ? highend_pfn : max_low_pfn;
#endif
__vmalloc_start_set = true;
@@ -673,8 +685,6 @@ void __init setup_bootmem_allocator(void)
printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
max_pfn_mapped<<PAGE_SHIFT);
printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
-
- after_bootmem = 1;
}
/*
@@ -713,16 +723,13 @@ static void __init test_wp_bit(void)
"Checking if this processor honours the WP bit even in supervisor mode...");
/* Any page-aligned address will do, the test is non-destructive */
- __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
+ __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_KERNEL_RO);
boot_cpu_data.wp_works_ok = do_test_wp_bit();
clear_fixmap(FIX_WP_TEST);
if (!boot_cpu_data.wp_works_ok) {
printk(KERN_CONT "No.\n");
-#ifdef CONFIG_X86_WP_WORKS_OK
- panic(
- "This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
-#endif
+ panic("Linux doesn't support CPUs with broken WP.");
} else {
printk(KERN_CONT "Ok.\n");
}
@@ -730,9 +737,6 @@ static void __init test_wp_bit(void)
void __init mem_init(void)
{
- int codesize, reservedpages, datasize, initsize;
- int tmp;
-
pci_iommu_alloc();
#ifdef CONFIG_FLATMEM
@@ -750,30 +754,11 @@ void __init mem_init(void)
set_highmem_pages_init();
/* this will put all low memory onto the freelists */
- totalram_pages += free_all_bootmem();
+ free_all_bootmem();
- reservedpages = 0;
- for (tmp = 0; tmp < max_low_pfn; tmp++)
- /*
- * Only count reserved RAM pages:
- */
- if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
- reservedpages++;
-
- codesize = (unsigned long) &_etext - (unsigned long) &_text;
- datasize = (unsigned long) &_edata - (unsigned long) &_etext;
- initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
-
- printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
- "%dk reserved, %dk data, %dk init, %ldk highmem)\n",
- nr_free_pages() << (PAGE_SHIFT-10),
- num_physpages << (PAGE_SHIFT-10),
- codesize >> 10,
- reservedpages << (PAGE_SHIFT-10),
- datasize >> 10,
- initsize >> 10,
- totalhigh_pages << (PAGE_SHIFT-10));
+ after_bootmem = 1;
+ mem_init_print_info(NULL);
printk(KERN_INFO "virtual kernel memory layout:\n"
" fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
#ifdef CONFIG_HIGHMEM
@@ -821,6 +806,9 @@ void __init mem_init(void)
BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END);
#undef high_memory
#undef __FIXADDR_TOP
+#ifdef CONFIG_RANDOMIZE_BASE
+ BUILD_BUG_ON(CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE);
+#endif
#ifdef CONFIG_HIGHMEM
BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
@@ -843,6 +831,18 @@ int arch_add_memory(int nid, u64 start, u64 size)
return __add_pages(nid, zone, start_pfn, nr_pages);
}
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+ struct zone *zone;
+
+ zone = page_zone(pfn_to_page(start_pfn));
+ return __remove_pages(zone, start_pfn, nr_pages);
+}
+#endif
#endif
/*
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 436a0309db3..df1a9927ad2 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -32,10 +32,10 @@
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>
#include <linux/gfp.h>
+#include <linux/kcore.h>
#include <asm/processor.h>
#include <asm/bios_ebda.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -55,6 +55,82 @@
#include <asm/uv/uv.h>
#include <asm/setup.h>
+#include "mm_internal.h"
+
+static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page,
+ unsigned long addr, unsigned long end)
+{
+ addr &= PMD_MASK;
+ for (; addr < end; addr += PMD_SIZE) {
+ pmd_t *pmd = pmd_page + pmd_index(addr);
+
+ if (!pmd_present(*pmd))
+ set_pmd(pmd, __pmd(addr | pmd_flag));
+ }
+}
+static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
+ unsigned long addr, unsigned long end)
+{
+ unsigned long next;
+
+ for (; addr < end; addr = next) {
+ pud_t *pud = pud_page + pud_index(addr);
+ pmd_t *pmd;
+
+ next = (addr & PUD_MASK) + PUD_SIZE;
+ if (next > end)
+ next = end;
+
+ if (pud_present(*pud)) {
+ pmd = pmd_offset(pud, 0);
+ ident_pmd_init(info->pmd_flag, pmd, addr, next);
+ continue;
+ }
+ pmd = (pmd_t *)info->alloc_pgt_page(info->context);
+ if (!pmd)
+ return -ENOMEM;
+ ident_pmd_init(info->pmd_flag, pmd, addr, next);
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+ }
+
+ return 0;
+}
+
+int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
+ unsigned long addr, unsigned long end)
+{
+ unsigned long next;
+ int result;
+ int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0;
+
+ for (; addr < end; addr = next) {
+ pgd_t *pgd = pgd_page + pgd_index(addr) + off;
+ pud_t *pud;
+
+ next = (addr & PGDIR_MASK) + PGDIR_SIZE;
+ if (next > end)
+ next = end;
+
+ if (pgd_present(*pgd)) {
+ pud = pud_offset(pgd, 0);
+ result = ident_pud_init(info, pud, addr, next);
+ if (result)
+ return result;
+ continue;
+ }
+
+ pud = (pud_t *)info->alloc_pgt_page(info->context);
+ if (!pud)
+ return -ENOMEM;
+ result = ident_pud_init(info, pud, addr, next);
+ if (result)
+ return result;
+ set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+ }
+
+ return 0;
+}
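
kernel_ident_mapping_init() deliberately leaves page allocation to the caller via info->alloc_pgt_page and info->context, so the same walker can be driven by very different allocators. A generic userspace sketch of that callback-plus-context pattern, with a hypothetical fixed pool standing in for whatever the real caller uses:

    #include <stdio.h>
    #include <string.h>

    #define POOL_PAGES 8
    #define PAGE_SZ    4096

    struct pool {
        unsigned char mem[POOL_PAGES][PAGE_SZ];
        int next;
    };

    /* same shape as info->alloc_pgt_page: return a zeroed page or NULL */
    static void *pool_alloc_page(void *context)
    {
        struct pool *p = context;

        if (p->next >= POOL_PAGES)
            return NULL;
        memset(p->mem[p->next], 0, PAGE_SZ);
        return p->mem[p->next++];
    }

    int main(void)
    {
        static struct pool pool;
        void *(*alloc_pgt_page)(void *) = pool_alloc_page;

        void *pud = alloc_pgt_page(&pool);      /* one table page per missing level */
        void *pmd = alloc_pgt_page(&pool);

        printf("allocated %p and %p, %d pool pages left\n",
               pud, pmd, POOL_PAGES - pool.next);
        return 0;
    }
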
+
static int __init parse_direct_gbpages_off(char *arg)
{
direct_gbpages = 0;
@@ -292,7 +368,7 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
*
* from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
*
- * phys_addr holds the negative offset to the kernel, which is added
+ * phys_base holds the negative offset to the kernel, which is added
* to the compile time generated pmds. This results in invalid pmds up
* to the point where we hit the physaddr 0 mapping.
*
@@ -303,10 +379,18 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
void __init cleanup_highmap(void)
{
unsigned long vaddr = __START_KERNEL_map;
- unsigned long vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
+ unsigned long vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE;
unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
pmd_t *pmd = level2_kernel_pgt;
+ /*
+ * Native path, max_pfn_mapped is not set yet.
+ * Xen has valid max_pfn_mapped set in
+ * arch/x86/xen/mmu.c:xen_setup_kernel_pagetable().
+ */
+ if (max_pfn_mapped)
+ vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
+
for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
if (pmd_none(*pmd))
continue;
@@ -315,69 +399,24 @@ void __init cleanup_highmap(void)
}
}
-static __ref void *alloc_low_page(unsigned long *phys)
-{
- unsigned long pfn = pgt_buf_end++;
- void *adr;
-
- if (after_bootmem) {
- adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
- *phys = __pa(adr);
-
- return adr;
- }
-
- if (pfn >= pgt_buf_top)
- panic("alloc_low_page: ran out of memory");
-
- adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
- clear_page(adr);
- *phys = pfn * PAGE_SIZE;
- return adr;
-}
-
-static __ref void *map_low_page(void *virt)
-{
- void *adr;
- unsigned long phys, left;
-
- if (after_bootmem)
- return virt;
-
- phys = __pa(virt);
- left = phys & (PAGE_SIZE - 1);
- adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE);
- adr = (void *)(((unsigned long)adr) | left);
-
- return adr;
-}
-
-static __ref void unmap_low_page(void *adr)
-{
- if (after_bootmem)
- return;
-
- early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE);
-}
-
static unsigned long __meminit
phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
pgprot_t prot)
{
- unsigned pages = 0;
+ unsigned long pages = 0, next;
unsigned long last_map_addr = end;
int i;
pte_t *pte = pte_page + pte_index(addr);
- for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
-
+ for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr = next, pte++) {
+ next = (addr & PAGE_MASK) + PAGE_SIZE;
if (addr >= end) {
- if (!after_bootmem) {
- for(; i < PTRS_PER_PTE; i++, pte++)
- set_pte(pte, __pte(0));
- }
- break;
+ if (!after_bootmem &&
+ !e820_any_mapped(addr & PAGE_MASK, next, E820_RAM) &&
+ !e820_any_mapped(addr & PAGE_MASK, next, E820_RESERVED_KERN))
+ set_pte(pte, __pte(0));
+ continue;
}
/*
@@ -387,7 +426,8 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
* these mappings are more intelligent.
*/
if (pte_val(*pte)) {
- pages++;
+ if (!after_bootmem)
+ pages++;
continue;
}
@@ -408,32 +448,31 @@ static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
unsigned long page_size_mask, pgprot_t prot)
{
- unsigned long pages = 0;
+ unsigned long pages = 0, next;
unsigned long last_map_addr = end;
int i = pmd_index(address);
- for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
- unsigned long pte_phys;
+ for (; i < PTRS_PER_PMD; i++, address = next) {
pmd_t *pmd = pmd_page + pmd_index(address);
pte_t *pte;
pgprot_t new_prot = prot;
+ next = (address & PMD_MASK) + PMD_SIZE;
if (address >= end) {
- if (!after_bootmem) {
- for (; i < PTRS_PER_PMD; i++, pmd++)
- set_pmd(pmd, __pmd(0));
- }
- break;
+ if (!after_bootmem &&
+ !e820_any_mapped(address & PMD_MASK, next, E820_RAM) &&
+ !e820_any_mapped(address & PMD_MASK, next, E820_RESERVED_KERN))
+ set_pmd(pmd, __pmd(0));
+ continue;
}
if (pmd_val(*pmd)) {
if (!pmd_large(*pmd)) {
spin_lock(&init_mm.page_table_lock);
- pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
+ pte = (pte_t *)pmd_page_vaddr(*pmd);
last_map_addr = phys_pte_init(pte, address,
end, prot);
- unmap_low_page(pte);
spin_unlock(&init_mm.page_table_lock);
continue;
}
@@ -450,7 +489,9 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
* attributes.
*/
if (page_size_mask & (1 << PG_LEVEL_2M)) {
- pages++;
+ if (!after_bootmem)
+ pages++;
+ last_map_addr = next;
continue;
}
new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
@@ -460,19 +501,18 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
pages++;
spin_lock(&init_mm.page_table_lock);
set_pte((pte_t *)pmd,
- pfn_pte(address >> PAGE_SHIFT,
+ pfn_pte((address & PMD_MASK) >> PAGE_SHIFT,
__pgprot(pgprot_val(prot) | _PAGE_PSE)));
spin_unlock(&init_mm.page_table_lock);
- last_map_addr = (address & PMD_MASK) + PMD_SIZE;
+ last_map_addr = next;
continue;
}
- pte = alloc_low_page(&pte_phys);
+ pte = alloc_low_page();
last_map_addr = phys_pte_init(pte, address, end, new_prot);
- unmap_low_page(pte);
spin_lock(&init_mm.page_table_lock);
- pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
+ pmd_populate_kernel(&init_mm, pmd, pte);
spin_unlock(&init_mm.page_table_lock);
}
update_page_count(PG_LEVEL_2M, pages);
@@ -483,31 +523,29 @@ static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
unsigned long page_size_mask)
{
- unsigned long pages = 0;
+ unsigned long pages = 0, next;
unsigned long last_map_addr = end;
int i = pud_index(addr);
- for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
- unsigned long pmd_phys;
+ for (; i < PTRS_PER_PUD; i++, addr = next) {
pud_t *pud = pud_page + pud_index(addr);
pmd_t *pmd;
pgprot_t prot = PAGE_KERNEL;
- if (addr >= end)
- break;
-
- if (!after_bootmem &&
- !e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
- set_pud(pud, __pud(0));
+ next = (addr & PUD_MASK) + PUD_SIZE;
+ if (addr >= end) {
+ if (!after_bootmem &&
+ !e820_any_mapped(addr & PUD_MASK, next, E820_RAM) &&
+ !e820_any_mapped(addr & PUD_MASK, next, E820_RESERVED_KERN))
+ set_pud(pud, __pud(0));
continue;
}
if (pud_val(*pud)) {
if (!pud_large(*pud)) {
- pmd = map_low_page(pmd_offset(pud, 0));
+ pmd = pmd_offset(pud, 0);
last_map_addr = phys_pmd_init(pmd, addr, end,
page_size_mask, prot);
- unmap_low_page(pmd);
__flush_tlb_all();
continue;
}
@@ -524,7 +562,9 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
* attributes.
*/
if (page_size_mask & (1 << PG_LEVEL_1G)) {
- pages++;
+ if (!after_bootmem)
+ pages++;
+ last_map_addr = next;
continue;
}
prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
@@ -534,19 +574,19 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
pages++;
spin_lock(&init_mm.page_table_lock);
set_pte((pte_t *)pud,
- pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+ pfn_pte((addr & PUD_MASK) >> PAGE_SHIFT,
+ PAGE_KERNEL_LARGE));
spin_unlock(&init_mm.page_table_lock);
- last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
+ last_map_addr = next;
continue;
}
- pmd = alloc_low_page(&pmd_phys);
+ pmd = alloc_low_page();
last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
prot);
- unmap_low_page(pmd);
spin_lock(&init_mm.page_table_lock);
- pud_populate(&init_mm, pud, __va(pmd_phys));
+ pud_populate(&init_mm, pud, pmd);
spin_unlock(&init_mm.page_table_lock);
}
__flush_tlb_all();
@@ -571,34 +611,29 @@ kernel_physical_mapping_init(unsigned long start,
for (; start < end; start = next) {
pgd_t *pgd = pgd_offset_k(start);
- unsigned long pud_phys;
pud_t *pud;
- next = (start + PGDIR_SIZE) & PGDIR_MASK;
- if (next > end)
- next = end;
+ next = (start & PGDIR_MASK) + PGDIR_SIZE;
if (pgd_val(*pgd)) {
- pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
+ pud = (pud_t *)pgd_page_vaddr(*pgd);
last_map_addr = phys_pud_init(pud, __pa(start),
__pa(end), page_size_mask);
- unmap_low_page(pud);
continue;
}
- pud = alloc_low_page(&pud_phys);
- last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
+ pud = alloc_low_page();
+ last_map_addr = phys_pud_init(pud, __pa(start), __pa(end),
page_size_mask);
- unmap_low_page(pud);
spin_lock(&init_mm.page_table_lock);
- pgd_populate(&init_mm, pgd, __va(pud_phys));
+ pgd_populate(&init_mm, pgd, pud);
spin_unlock(&init_mm.page_table_lock);
pgd_changed = true;
}
if (pgd_changed)
- sync_global_pgds(addr, end);
+ sync_global_pgds(addr, end - 1);
__flush_tlb_all();
@@ -608,7 +643,7 @@ kernel_physical_mapping_init(unsigned long start,
#ifndef CONFIG_NUMA
void __init initmem_init(void)
{
- memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
+ memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
}
#endif
@@ -623,7 +658,9 @@ void __init paging_init(void)
* numa support is not compiled in, and later node_set_state
* will not set it back.
*/
- node_clear_state(0, N_NORMAL_MEMORY);
+ node_clear_state(0, N_MEMORY);
+ if (N_MEMORY != N_NORMAL_MEMORY)
+ node_clear_state(0, N_NORMAL_MEMORY);
zone_sizes_init();
}
@@ -655,13 +692,11 @@ int arch_add_memory(int nid, u64 start, u64 size)
{
struct pglist_data *pgdat = NODE_DATA(nid);
struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
- unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
+ unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
- last_mapped_pfn = init_memory_mapping(start, start + size);
- if (last_mapped_pfn > max_pfn_mapped)
- max_pfn_mapped = last_mapped_pfn;
+ init_memory_mapping(start, start + size);
ret = __add_pages(nid, zone, start_pfn, nr_pages);
WARN_ON_ONCE(ret);
@@ -673,49 +708,357 @@ int arch_add_memory(int nid, u64 start, u64 size)
}
EXPORT_SYMBOL_GPL(arch_add_memory);
+#define PAGE_INUSE 0xFD
+
+static void __meminit free_pagetable(struct page *page, int order)
+{
+ unsigned long magic;
+ unsigned int nr_pages = 1 << order;
+
+	/* bootmem pages have the reserved flag set */
+ if (PageReserved(page)) {
+ __ClearPageReserved(page);
+
+ magic = (unsigned long)page->lru.next;
+ if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
+ while (nr_pages--)
+ put_page_bootmem(page++);
+ } else
+ while (nr_pages--)
+ free_reserved_page(page++);
+ } else
+ free_pages((unsigned long)page_address(page), order);
+}
+
+static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+ pte_t *pte;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ pte = pte_start + i;
+ if (pte_val(*pte))
+ return;
+ }
+
+	/* free a pte table */
+ free_pagetable(pmd_page(*pmd), 0);
+ spin_lock(&init_mm.page_table_lock);
+ pmd_clear(pmd);
+ spin_unlock(&init_mm.page_table_lock);
+}
+
+static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+ pmd_t *pmd;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ pmd = pmd_start + i;
+ if (pmd_val(*pmd))
+ return;
+ }
+
+	/* free a pmd table */
+ free_pagetable(pud_page(*pud), 0);
+ spin_lock(&init_mm.page_table_lock);
+ pud_clear(pud);
+ spin_unlock(&init_mm.page_table_lock);
+}
+
+/* Return true if pgd is changed, otherwise return false. */
+static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd)
+{
+ pud_t *pud;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PUD; i++) {
+ pud = pud_start + i;
+ if (pud_val(*pud))
+ return false;
+ }
+
+ /* free a pud table */
+ free_pagetable(pgd_page(*pgd), 0);
+ spin_lock(&init_mm.page_table_lock);
+ pgd_clear(pgd);
+ spin_unlock(&init_mm.page_table_lock);
+
+ return true;
+}
+
+static void __meminit
+remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
+ bool direct)
+{
+ unsigned long next, pages = 0;
+ pte_t *pte;
+ void *page_addr;
+ phys_addr_t phys_addr;
+
+ pte = pte_start + pte_index(addr);
+ for (; addr < end; addr = next, pte++) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ if (next > end)
+ next = end;
+
+ if (!pte_present(*pte))
+ continue;
+
+ /*
+		 * We mapped [0, 1G) of memory as an identity mapping when
+		 * initializing, in arch/x86/kernel/head_64.S. These
+		 * page tables cannot be removed.
+ */
+ phys_addr = pte_val(*pte) + (addr & PAGE_MASK);
+ if (phys_addr < (phys_addr_t)0x40000000)
+ return;
+
+ if (IS_ALIGNED(addr, PAGE_SIZE) &&
+ IS_ALIGNED(next, PAGE_SIZE)) {
+ /*
+ * Do not free direct mapping pages since they were
+			 * freed when offlining, or are simply not in use.
+ */
+ if (!direct)
+ free_pagetable(pte_page(*pte), 0);
+
+ spin_lock(&init_mm.page_table_lock);
+ pte_clear(&init_mm, addr, pte);
+ spin_unlock(&init_mm.page_table_lock);
+
+			/* For non-direct mappings, the pages count means nothing. */
+ pages++;
+ } else {
+ /*
+ * If we are here, we are freeing vmemmap pages since
+ * direct mapped memory ranges to be freed are aligned.
+ *
+			 * If we are not removing the whole page, it means
+			 * other page structs in this page are still in use and
+			 * we cannot remove them. So fill the unused page structs
+			 * with 0xFD, and remove the page once it is wholly
+			 * filled with 0xFD.
+ */
+ memset((void *)addr, PAGE_INUSE, next - addr);
+
+ page_addr = page_address(pte_page(*pte));
+ if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
+ free_pagetable(pte_page(*pte), 0);
+
+ spin_lock(&init_mm.page_table_lock);
+ pte_clear(&init_mm, addr, pte);
+ spin_unlock(&init_mm.page_table_lock);
+ }
+ }
+ }
+
+ /* Call free_pte_table() in remove_pmd_table(). */
+ flush_tlb_all();
+ if (direct)
+ update_page_count(PG_LEVEL_4K, -pages);
+}
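
The 0xFD handling above boils down to: poison the freed sub-range of a vmemmap page, then release the backing page only once every byte in it carries the poison. memchr_inv() is kernel-only, so this userspace sketch uses a small scan helper instead, with the "page" shrunk for illustration:

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    #define PAGE_INUSE 0xFD
    #define PAGE_SZ    64           /* shrunk stand-in for a real page */

    static bool all_poisoned(const unsigned char *p, size_t n)
    {
        for (size_t i = 0; i < n; i++)      /* stand-in for !memchr_inv() */
            if (p[i] != PAGE_INUSE)
                return false;
        return true;
    }

    int main(void)
    {
        unsigned char page[PAGE_SZ] = { 0 };

        memset(page, PAGE_INUSE, PAGE_SZ / 2);                  /* first half freed */
        printf("free backing page? %d\n", all_poisoned(page, PAGE_SZ));     /* 0 */

        memset(page + PAGE_SZ / 2, PAGE_INUSE, PAGE_SZ / 2);    /* rest freed too */
        printf("free backing page? %d\n", all_poisoned(page, PAGE_SZ));     /* 1 */
        return 0;
    }
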
+
+static void __meminit
+remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
+ bool direct)
+{
+ unsigned long next, pages = 0;
+ pte_t *pte_base;
+ pmd_t *pmd;
+ void *page_addr;
+
+ pmd = pmd_start + pmd_index(addr);
+ for (; addr < end; addr = next, pmd++) {
+ next = pmd_addr_end(addr, end);
+
+ if (!pmd_present(*pmd))
+ continue;
+
+ if (pmd_large(*pmd)) {
+ if (IS_ALIGNED(addr, PMD_SIZE) &&
+ IS_ALIGNED(next, PMD_SIZE)) {
+ if (!direct)
+ free_pagetable(pmd_page(*pmd),
+ get_order(PMD_SIZE));
+
+ spin_lock(&init_mm.page_table_lock);
+ pmd_clear(pmd);
+ spin_unlock(&init_mm.page_table_lock);
+ pages++;
+ } else {
+ /* If here, we are freeing vmemmap pages. */
+ memset((void *)addr, PAGE_INUSE, next - addr);
+
+ page_addr = page_address(pmd_page(*pmd));
+ if (!memchr_inv(page_addr, PAGE_INUSE,
+ PMD_SIZE)) {
+ free_pagetable(pmd_page(*pmd),
+ get_order(PMD_SIZE));
+
+ spin_lock(&init_mm.page_table_lock);
+ pmd_clear(pmd);
+ spin_unlock(&init_mm.page_table_lock);
+ }
+ }
+
+ continue;
+ }
+
+ pte_base = (pte_t *)pmd_page_vaddr(*pmd);
+ remove_pte_table(pte_base, addr, next, direct);
+ free_pte_table(pte_base, pmd);
+ }
+
+ /* Call free_pmd_table() in remove_pud_table(). */
+ if (direct)
+ update_page_count(PG_LEVEL_2M, -pages);
+}
+
+static void __meminit
+remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
+ bool direct)
+{
+ unsigned long next, pages = 0;
+ pmd_t *pmd_base;
+ pud_t *pud;
+ void *page_addr;
+
+ pud = pud_start + pud_index(addr);
+ for (; addr < end; addr = next, pud++) {
+ next = pud_addr_end(addr, end);
+
+ if (!pud_present(*pud))
+ continue;
+
+ if (pud_large(*pud)) {
+ if (IS_ALIGNED(addr, PUD_SIZE) &&
+ IS_ALIGNED(next, PUD_SIZE)) {
+ if (!direct)
+ free_pagetable(pud_page(*pud),
+ get_order(PUD_SIZE));
+
+ spin_lock(&init_mm.page_table_lock);
+ pud_clear(pud);
+ spin_unlock(&init_mm.page_table_lock);
+ pages++;
+ } else {
+ /* If here, we are freeing vmemmap pages. */
+ memset((void *)addr, PAGE_INUSE, next - addr);
+
+ page_addr = page_address(pud_page(*pud));
+ if (!memchr_inv(page_addr, PAGE_INUSE,
+ PUD_SIZE)) {
+ free_pagetable(pud_page(*pud),
+ get_order(PUD_SIZE));
+
+ spin_lock(&init_mm.page_table_lock);
+ pud_clear(pud);
+ spin_unlock(&init_mm.page_table_lock);
+ }
+ }
+
+ continue;
+ }
+
+ pmd_base = (pmd_t *)pud_page_vaddr(*pud);
+ remove_pmd_table(pmd_base, addr, next, direct);
+ free_pmd_table(pmd_base, pud);
+ }
+
+ if (direct)
+ update_page_count(PG_LEVEL_1G, -pages);
+}
+
+/* start and end are both virtual addresses. */
+static void __meminit
+remove_pagetable(unsigned long start, unsigned long end, bool direct)
+{
+ unsigned long next;
+ pgd_t *pgd;
+ pud_t *pud;
+ bool pgd_changed = false;
+
+ for (; start < end; start = next) {
+ next = pgd_addr_end(start, end);
+
+ pgd = pgd_offset_k(start);
+ if (!pgd_present(*pgd))
+ continue;
+
+ pud = (pud_t *)pgd_page_vaddr(*pgd);
+ remove_pud_table(pud, start, next, direct);
+ if (free_pud_table(pud, pgd))
+ pgd_changed = true;
+ }
+
+ if (pgd_changed)
+ sync_global_pgds(start, end - 1);
+
+ flush_tlb_all();
+}
+
+void __ref vmemmap_free(unsigned long start, unsigned long end)
+{
+ remove_pagetable(start, end, false);
+}
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+static void __meminit
+kernel_physical_mapping_remove(unsigned long start, unsigned long end)
+{
+ start = (unsigned long)__va(start);
+ end = (unsigned long)__va(end);
+
+ remove_pagetable(start, end, true);
+}
+
+int __ref arch_remove_memory(u64 start, u64 size)
+{
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+ struct zone *zone;
+ int ret;
+
+ zone = page_zone(pfn_to_page(start_pfn));
+ kernel_physical_mapping_remove(start, start + size);
+ ret = __remove_pages(zone, start_pfn, nr_pages);
+ WARN_ON_ONCE(ret);
+
+ return ret;
+}
+#endif
#endif /* CONFIG_MEMORY_HOTPLUG */
static struct kcore_list kcore_vsyscall;
-void __init mem_init(void)
+static void __init register_page_bootmem_info(void)
{
- long codesize, reservedpages, datasize, initsize;
- unsigned long absent_pages;
+#ifdef CONFIG_NUMA
+ int i;
+
+ for_each_online_node(i)
+ register_page_bootmem_info_node(NODE_DATA(i));
+#endif
+}
+void __init mem_init(void)
+{
pci_iommu_alloc();
/* clear_bss() already clear the empty_zero_page */
- reservedpages = 0;
-
- /* this will put all low memory onto the freelists */
-#ifdef CONFIG_NUMA
- totalram_pages = numa_free_all_bootmem();
-#else
- totalram_pages = free_all_bootmem();
-#endif
+ register_page_bootmem_info();
- absent_pages = absent_pages_in_range(0, max_pfn);
- reservedpages = max_pfn - totalram_pages - absent_pages;
+ /* this will put all memory onto the freelists */
+ free_all_bootmem();
after_bootmem = 1;
- codesize = (unsigned long) &_etext - (unsigned long) &_text;
- datasize = (unsigned long) &_edata - (unsigned long) &_etext;
- initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
-
/* Register memory areas for /proc/kcore */
- kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
- VSYSCALL_END - VSYSCALL_START, KCORE_OTHER);
+ kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR,
+ PAGE_SIZE, KCORE_OTHER);
- printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
- "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n",
- nr_free_pages() << (PAGE_SHIFT-10),
- max_pfn << (PAGE_SHIFT-10),
- codesize >> 10,
- absent_pages << (PAGE_SHIFT-10),
- reservedpages << (PAGE_SHIFT-10),
- datasize >> 10,
- initsize >> 10);
+ mem_init_print_info(NULL);
}
#ifdef CONFIG_DEBUG_RODATA
@@ -763,12 +1106,11 @@ void set_kernel_text_ro(void)
void mark_rodata_ro(void)
{
unsigned long start = PFN_ALIGN(_text);
- unsigned long rodata_start =
- ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
+ unsigned long rodata_start = PFN_ALIGN(__start_rodata);
unsigned long end = (unsigned long) &__end_rodata_hpage_align;
- unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
- unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
- unsigned long data_start = (unsigned long) &_sdata;
+ unsigned long text_end = PFN_ALIGN(&__stop___ex_table);
+ unsigned long rodata_end = PFN_ALIGN(&__end_rodata);
+ unsigned long all_end = PFN_ALIGN(&_end);
printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
(end - start) >> 10);
@@ -777,10 +1119,10 @@ void mark_rodata_ro(void)
kernel_set_to_readonly = 1;
/*
- * The rodata section (but not the kernel text!) should also be
- * not-executable.
+ * The rodata/data/bss/brk section (but not the kernel text!)
+ * should also be not-executable.
*/
- set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
+ set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT);
rodata_test();
@@ -792,13 +1134,12 @@ void mark_rodata_ro(void)
set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif
- free_init_pages("unused kernel memory",
- (unsigned long) page_address(virt_to_page(text_end)),
- (unsigned long)
- page_address(virt_to_page(rodata_start)));
- free_init_pages("unused kernel memory",
- (unsigned long) page_address(virt_to_page(rodata_end)),
- (unsigned long) page_address(virt_to_page(data_start)));
+ free_init_pages("unused kernel",
+ (unsigned long) __va(__pa_symbol(text_end)),
+ (unsigned long) __va(__pa_symbol(rodata_start)));
+ free_init_pages("unused kernel",
+ (unsigned long) __va(__pa_symbol(rodata_end)),
+ (unsigned long) __va(__pa_symbol(_sdata)));
}
#endif
@@ -822,6 +1163,9 @@ int kern_addr_valid(unsigned long addr)
if (pud_none(*pud))
return 0;
+ if (pud_large(*pud))
+ return pfn_valid(pud_pfn(*pud));
+
pmd = pmd_offset(pud, addr);
if (pmd_none(*pmd))
return 0;
@@ -841,11 +1185,19 @@ int kern_addr_valid(unsigned long addr)
* covers the 64bit vsyscall page now. 32bit has a real VMA now and does
* not need special handling anymore:
*/
+static const char *gate_vma_name(struct vm_area_struct *vma)
+{
+ return "[vsyscall]";
+}
+static struct vm_operations_struct gate_vma_ops = {
+ .name = gate_vma_name,
+};
static struct vm_area_struct gate_vma = {
- .vm_start = VSYSCALL_START,
- .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
+ .vm_start = VSYSCALL_ADDR,
+ .vm_end = VSYSCALL_ADDR + PAGE_SIZE,
.vm_page_prot = PAGE_READONLY_EXEC,
- .vm_flags = VM_READ | VM_EXEC
+ .vm_flags = VM_READ | VM_EXEC,
+ .vm_ops = &gate_vma_ops,
};
struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
@@ -874,29 +1226,46 @@ int in_gate_area(struct mm_struct *mm, unsigned long addr)
*/
int in_gate_area_no_mm(unsigned long addr)
{
- return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
+ return (addr & PAGE_MASK) == VSYSCALL_ADDR;
}
-const char *arch_vma_name(struct vm_area_struct *vma)
+static unsigned long probe_memory_block_size(void)
{
- if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
- return "[vdso]";
- if (vma == &gate_vma)
- return "[vsyscall]";
- return NULL;
-}
+ /* start from 2g */
+ unsigned long bz = 1UL<<31;
#ifdef CONFIG_X86_UV
-unsigned long memory_block_size_bytes(void)
-{
if (is_uv_system()) {
printk(KERN_INFO "UV: memory block size 2GB\n");
return 2UL * 1024 * 1024 * 1024;
}
- return MIN_MEMORY_BLOCK_SIZE;
-}
#endif
+ /* less than 64g installed */
+ if ((max_pfn << PAGE_SHIFT) < (16UL << 32))
+ return MIN_MEMORY_BLOCK_SIZE;
+
+ /* get the tail size */
+ while (bz > MIN_MEMORY_BLOCK_SIZE) {
+ if (!((max_pfn << PAGE_SHIFT) & (bz - 1)))
+ break;
+ bz >>= 1;
+ }
+
+ printk(KERN_DEBUG "memory block size : %ldMB\n", bz >> 20);
+
+ return bz;
+}
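
The probe above amounts to: pick the largest power of two, capped at 2 GiB and floored at MIN_MEMORY_BLOCK_SIZE, that evenly divides the end-of-memory address. A standalone sketch with hypothetical numbers (MIN_MEMORY_BLOCK_SIZE is assumed to be 128 MiB here, which is an assumption of this sketch rather than something the patch states):

    #include <stdio.h>

    int main(void)
    {
        unsigned long long min_blk = 128ULL << 20;      /* assumed MIN_MEMORY_BLOCK_SIZE */
        unsigned long long end     = 0x1060000000ULL;   /* hypothetical end of RAM, ~65.5 GiB */
        unsigned long long bz      = 2ULL << 30;        /* start from 2 GiB */

        if (end < (64ULL << 30)) {                      /* small boxes keep the minimum */
            printf("memory block size: %llu MiB\n", min_blk >> 20);
            return 0;
        }
        while (bz > min_blk && (end & (bz - 1)))        /* halve until it divides 'end' */
            bz >>= 1;
        printf("memory block size: %llu MiB\n", bz >> 20);      /* 512 MiB here */
        return 0;
    }
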
+
+static unsigned long memory_block_size_probed;
+unsigned long memory_block_size_bytes(void)
+{
+ if (!memory_block_size_probed)
+ memory_block_size_probed = probe_memory_block_size();
+
+ return memory_block_size_probed;
+}
+
#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
* Initialise the sparsemem vmemmap using huge-pages at the PMD level.
@@ -905,18 +1274,17 @@ static long __meminitdata addr_start, addr_end;
static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;
-int __meminit
-vmemmap_populate(struct page *start_page, unsigned long size, int node)
+static int __meminit vmemmap_populate_hugepages(unsigned long start,
+ unsigned long end, int node)
{
- unsigned long addr = (unsigned long)start_page;
- unsigned long end = (unsigned long)(start_page + size);
+ unsigned long addr;
unsigned long next;
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
- for (; addr < end; addr = next) {
- void *p = NULL;
+ for (addr = start; addr < end; addr = next) {
+ next = pmd_addr_end(addr, end);
pgd = vmemmap_pgd_populate(addr, node);
if (!pgd)
@@ -926,31 +1294,14 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
if (!pud)
return -ENOMEM;
- if (!cpu_has_pse) {
- next = (addr + PAGE_SIZE) & PAGE_MASK;
- pmd = vmemmap_pmd_populate(pud, addr, node);
-
- if (!pmd)
- return -ENOMEM;
-
- p = vmemmap_pte_populate(pmd, addr, node);
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd)) {
+ void *p;
- if (!p)
- return -ENOMEM;
-
- addr_end = addr + PAGE_SIZE;
- p_end = p + PAGE_SIZE;
- } else {
- next = pmd_addr_end(addr, end);
-
- pmd = pmd_offset(pud, addr);
- if (pmd_none(*pmd)) {
+ p = vmemmap_alloc_block_buf(PMD_SIZE, node);
+ if (p) {
pte_t entry;
- p = vmemmap_alloc_block_buf(PMD_SIZE, node);
- if (!p)
- return -ENOMEM;
-
entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
PAGE_KERNEL_LARGE);
set_pmd(pmd, __pmd(pte_val(entry)));
@@ -967,15 +1318,92 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
addr_end = addr + PMD_SIZE;
p_end = p + PMD_SIZE;
- } else
- vmemmap_verify((pte_t *)pmd, node, addr, next);
+ continue;
+ }
+ } else if (pmd_large(*pmd)) {
+ vmemmap_verify((pte_t *)pmd, node, addr, next);
+ continue;
}
-
+ pr_warn_once("vmemmap: falling back to regular page backing\n");
+ if (vmemmap_populate_basepages(addr, next, node))
+ return -ENOMEM;
}
- sync_global_pgds((unsigned long)start_page, end);
return 0;
}
+int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
+{
+ int err;
+
+ if (cpu_has_pse)
+ err = vmemmap_populate_hugepages(start, end, node);
+ else
+ err = vmemmap_populate_basepages(start, end, node);
+ if (!err)
+ sync_global_pgds(start, end - 1);
+ return err;
+}
+
+#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE)
+void register_page_bootmem_memmap(unsigned long section_nr,
+ struct page *start_page, unsigned long size)
+{
+ unsigned long addr = (unsigned long)start_page;
+ unsigned long end = (unsigned long)(start_page + size);
+ unsigned long next;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ unsigned int nr_pages;
+ struct page *page;
+
+ for (; addr < end; addr = next) {
+ pte_t *pte = NULL;
+
+ pgd = pgd_offset_k(addr);
+ if (pgd_none(*pgd)) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ continue;
+ }
+ get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO);
+
+ pud = pud_offset(pgd, addr);
+ if (pud_none(*pud)) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ continue;
+ }
+ get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);
+
+ if (!cpu_has_pse) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd))
+ continue;
+ get_page_bootmem(section_nr, pmd_page(*pmd),
+ MIX_SECTION_INFO);
+
+ pte = pte_offset_kernel(pmd, addr);
+ if (pte_none(*pte))
+ continue;
+ get_page_bootmem(section_nr, pte_page(*pte),
+ SECTION_INFO);
+ } else {
+ next = pmd_addr_end(addr, end);
+
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd))
+ continue;
+
+ nr_pages = 1 << (get_order(PMD_SIZE));
+ page = pmd_page(*pmd);
+ while (nr_pages--)
+ get_page_bootmem(section_nr, page++,
+ SECTION_INFO);
+ }
+ }
+}
+#endif
+
void __meminit vmemmap_populate_print_last(void)
{
if (p_start) {
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index be1ef574ce9..baff1da354e 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -50,6 +50,21 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size,
return err;
}
+static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages,
+ void *arg)
+{
+ unsigned long i;
+
+ for (i = 0; i < nr_pages; ++i)
+ if (pfn_valid(start_pfn + i) &&
+ !PageReserved(pfn_to_page(start_pfn + i)))
+ return 1;
+
+ WARN_ONCE(1, "ioremap on RAM pfn 0x%lx\n", start_pfn);
+
+ return 0;
+}
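
The rewrite below hands this per-range callback to walk_system_ram_range() instead of open-coding a pfn loop; the walker stops as soon as a callback returns nonzero. A generic userspace sketch of that walker/callback shape over a hypothetical static table of RAM ranges (not the real iomem resource tree):

    #include <stdio.h>

    struct ram_range { unsigned long start_pfn, nr_pages; };

    static const struct ram_range ram[] = { { 0x100, 0x7f00 }, { 0x100000, 0x80000 } };

    static int walk_ram(unsigned long pfn, unsigned long nr, void *arg,
                        int (*fn)(unsigned long, unsigned long, void *))
    {
        for (unsigned int i = 0; i < sizeof(ram) / sizeof(ram[0]); i++) {
            unsigned long s = ram[i].start_pfn, e = s + ram[i].nr_pages;
            unsigned long lo = pfn > s ? pfn : s;
            unsigned long hi = pfn + nr < e ? pfn + nr : e;
            int ret;

            if (lo >= hi)
                continue;
            ret = fn(lo, hi - lo, arg);     /* nonzero stops the walk */
            if (ret)
                return ret;
        }
        return 0;
    }

    static int reject_ram(unsigned long start_pfn, unsigned long nr_pages, void *arg)
    {
        (void)arg;
        printf("refusing to remap RAM pfns [%#lx, %#lx)\n",
               start_pfn, start_pfn + nr_pages);
        return 1;
    }

    int main(void)
    {
        if (walk_ram(0x80, 0x100, NULL, reject_ram) == 1)
            printf("ioremap would fail here\n");
        return 0;
    }
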
+
/*
* Remap an arbitrary physical address space into the kernel virtual
* address space. Needed when the kernel wants to access high addresses
@@ -93,14 +108,11 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
/*
* Don't allow anybody to remap normal RAM that we're using..
*/
+ pfn = phys_addr >> PAGE_SHIFT;
last_pfn = last_addr >> PAGE_SHIFT;
- for (pfn = phys_addr >> PAGE_SHIFT; pfn <= last_pfn; pfn++) {
- int is_ram = page_is_ram(pfn);
-
- if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
- return NULL;
- WARN_ON_ONCE(is_ram);
- }
+ if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL,
+ __ioremap_check_ram) == 1)
+ return NULL;
/*
* Mappings have to be page-aligned
@@ -180,7 +192,7 @@ err_free_memtype:
/**
* ioremap_nocache - map bus memory into CPU space
- * @offset: bus address of the memory
+ * @phys_addr: bus address of the memory
* @size: size of the resource to map
*
* ioremap_nocache performs a platform specific sequence of operations to
@@ -217,7 +229,7 @@ EXPORT_SYMBOL(ioremap_nocache);
/**
* ioremap_wc - map memory into CPU space write combined
- * @offset: bus address of the memory
+ * @phys_addr: bus address of the memory
* @size: size of the resource to map
*
* This version of ioremap ensures that the memory is marked write combining.
@@ -282,12 +294,7 @@ void iounmap(volatile void __iomem *addr)
in parallel. Reuse of the virtual address is prevented by
leaving it in the global lists until we're done with it.
cpa takes care of the direct mappings. */
- read_lock(&vmlist_lock);
- for (p = vmlist; p; p = p->next) {
- if (p->addr == (void __force *)addr)
- break;
- }
- read_unlock(&vmlist_lock);
+ p = find_vm_area((void __force *)addr);
if (!p) {
printk(KERN_ERR "iounmap: bad address %p\n", addr);
@@ -333,17 +340,6 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
return;
}
-static int __initdata early_ioremap_debug;
-
-static int __init early_ioremap_debug_setup(char *str)
-{
- early_ioremap_debug = 1;
-
- return 0;
-}
-early_param("early_ioremap_debug", early_ioremap_debug_setup);
-
-static __initdata int after_paging_init;
static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
@@ -367,18 +363,17 @@ bool __init is_early_ioremap_ptep(pte_t *ptep)
return ptep >= &bm_pte[0] && ptep < &bm_pte[PAGE_SIZE/sizeof(pte_t)];
}
-static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
-
void __init early_ioremap_init(void)
{
pmd_t *pmd;
- int i;
- if (early_ioremap_debug)
- printk(KERN_INFO "early_ioremap_init()\n");
+#ifdef CONFIG_X86_64
+ BUILD_BUG_ON((fix_to_virt(0) + PAGE_SIZE) & ((1 << PMD_SHIFT) - 1));
+#else
+ WARN_ON((fix_to_virt(0) + PAGE_SIZE) & ((1 << PMD_SHIFT) - 1));
+#endif
- for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
- slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
+ early_ioremap_setup();
pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
memset(bm_pte, 0, sizeof(bm_pte));
@@ -407,13 +402,8 @@ void __init early_ioremap_init(void)
}
}
-void __init early_ioremap_reset(void)
-{
- after_paging_init = 1;
-}
-
-static void __init __early_set_fixmap(enum fixed_addresses idx,
- phys_addr_t phys, pgprot_t flags)
+void __init __early_set_fixmap(enum fixed_addresses idx,
+ phys_addr_t phys, pgprot_t flags)
{
unsigned long addr = __fix_to_virt(idx);
pte_t *pte;
@@ -430,199 +420,3 @@ static void __init __early_set_fixmap(enum fixed_addresses idx,
pte_clear(&init_mm, addr, pte);
__flush_tlb_one(addr);
}
-
-static inline void __init early_set_fixmap(enum fixed_addresses idx,
- phys_addr_t phys, pgprot_t prot)
-{
- if (after_paging_init)
- __set_fixmap(idx, phys, prot);
- else
- __early_set_fixmap(idx, phys, prot);
-}
-
-static inline void __init early_clear_fixmap(enum fixed_addresses idx)
-{
- if (after_paging_init)
- clear_fixmap(idx);
- else
- __early_set_fixmap(idx, 0, __pgprot(0));
-}
-
-static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
-static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
-
-void __init fixup_early_ioremap(void)
-{
- int i;
-
- for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
- if (prev_map[i]) {
- WARN_ON(1);
- break;
- }
- }
-
- early_ioremap_init();
-}
-
-static int __init check_early_ioremap_leak(void)
-{
- int count = 0;
- int i;
-
- for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
- if (prev_map[i])
- count++;
-
- if (!count)
- return 0;
- WARN(1, KERN_WARNING
- "Debug warning: early ioremap leak of %d areas detected.\n",
- count);
- printk(KERN_WARNING
- "please boot with early_ioremap_debug and report the dmesg.\n");
-
- return 1;
-}
-late_initcall(check_early_ioremap_leak);
-
-static void __init __iomem *
-__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
-{
- unsigned long offset;
- resource_size_t last_addr;
- unsigned int nrpages;
- enum fixed_addresses idx0, idx;
- int i, slot;
-
- WARN_ON(system_state != SYSTEM_BOOTING);
-
- slot = -1;
- for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
- if (!prev_map[i]) {
- slot = i;
- break;
- }
- }
-
- if (slot < 0) {
- printk(KERN_INFO "early_iomap(%08llx, %08lx) not found slot\n",
- (u64)phys_addr, size);
- WARN_ON(1);
- return NULL;
- }
-
- if (early_ioremap_debug) {
- printk(KERN_INFO "early_ioremap(%08llx, %08lx) [%d] => ",
- (u64)phys_addr, size, slot);
- dump_stack();
- }
-
- /* Don't allow wraparound or zero size */
- last_addr = phys_addr + size - 1;
- if (!size || last_addr < phys_addr) {
- WARN_ON(1);
- return NULL;
- }
-
- prev_size[slot] = size;
- /*
- * Mappings have to be page-aligned
- */
- offset = phys_addr & ~PAGE_MASK;
- phys_addr &= PAGE_MASK;
- size = PAGE_ALIGN(last_addr + 1) - phys_addr;
-
- /*
- * Mappings have to fit in the FIX_BTMAP area.
- */
- nrpages = size >> PAGE_SHIFT;
- if (nrpages > NR_FIX_BTMAPS) {
- WARN_ON(1);
- return NULL;
- }
-
- /*
- * Ok, go for it..
- */
- idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
- idx = idx0;
- while (nrpages > 0) {
- early_set_fixmap(idx, phys_addr, prot);
- phys_addr += PAGE_SIZE;
- --idx;
- --nrpages;
- }
- if (early_ioremap_debug)
- printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]);
-
- prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
- return prev_map[slot];
-}
-
-/* Remap an IO device */
-void __init __iomem *
-early_ioremap(resource_size_t phys_addr, unsigned long size)
-{
- return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO);
-}
-
-/* Remap memory */
-void __init __iomem *
-early_memremap(resource_size_t phys_addr, unsigned long size)
-{
- return __early_ioremap(phys_addr, size, PAGE_KERNEL);
-}
-
-void __init early_iounmap(void __iomem *addr, unsigned long size)
-{
- unsigned long virt_addr;
- unsigned long offset;
- unsigned int nrpages;
- enum fixed_addresses idx;
- int i, slot;
-
- slot = -1;
- for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
- if (prev_map[i] == addr) {
- slot = i;
- break;
- }
- }
-
- if (slot < 0) {
- printk(KERN_INFO "early_iounmap(%p, %08lx) not found slot\n",
- addr, size);
- WARN_ON(1);
- return;
- }
-
- if (prev_size[slot] != size) {
- printk(KERN_INFO "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n",
- addr, size, slot, prev_size[slot]);
- WARN_ON(1);
- return;
- }
-
- if (early_ioremap_debug) {
- printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
- size, slot);
- dump_stack();
- }
-
- virt_addr = (unsigned long)addr;
- if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) {
- WARN_ON(1);
- return;
- }
- offset = virt_addr & ~PAGE_MASK;
- nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT;
-
- idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
- while (nrpages > 0) {
- early_clear_fixmap(idx);
- --idx;
- --nrpages;
- }
- prev_map[slot] = NULL;
-}
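
The block removed above implemented x86's boot-time fixmap remapping as a small fixed-slot allocator: each early mapping claims one of FIX_BTMAPS_SLOTS slots, records its size in prev_size[], installs fixmap PTEs for the page-aligned range, and hands back the slot's virtual base plus the sub-page offset. A minimal user-space sketch of just the slot bookkeeping follows; NR_SLOTS, the slot base addresses and the map/unmap helper names are made up for illustration, and the fixmap PTE manipulation is reduced to comments.

#include <stdio.h>

#define NR_SLOTS	4
#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))

static unsigned long slot_virt[NR_SLOTS] = {	/* illustrative slot bases */
	0xffff1000, 0xffff5000, 0xffff9000, 0xffffd000,
};
static void *prev_map[NR_SLOTS];
static unsigned long prev_size[NR_SLOTS];

static void *early_map(unsigned long phys, unsigned long size)
{
	unsigned long offset = phys & ~PAGE_MASK;	/* keep the sub-page offset */
	int slot = -1, i;

	for (i = 0; i < NR_SLOTS; i++)
		if (!prev_map[i]) {
			slot = i;
			break;
		}
	if (slot < 0 || !size)
		return NULL;

	prev_size[slot] = size;
	/* Real code would now install fixmap PTEs covering the page-aligned range. */
	prev_map[slot] = (void *)(slot_virt[slot] + offset);
	return prev_map[slot];
}

static void early_unmap(void *addr, unsigned long size)
{
	int i;

	for (i = 0; i < NR_SLOTS; i++)
		if (prev_map[i] == addr && prev_size[i] == size) {
			/* Real code would clear the fixmap PTEs here. */
			prev_map[i] = NULL;
			return;
		}
	fprintf(stderr, "no such mapping\n");
}

int main(void)
{
	void *p = early_map(0x000f1234, 64);

	printf("mapped at %p\n", p);
	early_unmap(p, 64);
	return 0;
}
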
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
index d87dd6d042d..dd89a13f105 100644
--- a/arch/x86/mm/kmemcheck/kmemcheck.c
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -78,10 +78,16 @@ early_initcall(kmemcheck_init);
*/
static int __init param_kmemcheck(char *str)
{
+ int val;
+ int ret;
+
if (!str)
return -EINVAL;
- sscanf(str, "%d", &kmemcheck_enabled);
+ ret = kstrtoint(str, 0, &val);
+ if (ret)
+ return ret;
+ kmemcheck_enabled = val;
return 0;
}
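
The kmemcheck hunk above swaps an unchecked sscanf() for kstrtoint() so that a malformed "kmemcheck=" argument is rejected before anything is written to kmemcheck_enabled. A small user-space sketch of the same parse-into-a-temporary-then-commit pattern, using strtol; parse_int_param() and param_enabled are illustrative names, not kernel interfaces.

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

static int param_enabled;	/* stand-in for the real knob */

/* Parse str strictly; only report success when the whole string is a valid int. */
static int parse_int_param(const char *str, int *out)
{
	char *end;
	long val;

	if (!str || !*str)
		return -EINVAL;

	errno = 0;
	val = strtol(str, &end, 0);
	if (errno || *end != '\0' || val < INT_MIN || val > INT_MAX)
		return -EINVAL;

	*out = (int)val;
	return 0;
}

int main(void)
{
	int val;

	if (!parse_int_param("1", &val))
		param_enabled = val;		/* commit only on success */
	printf("enabled=%d\n", param_enabled);

	if (parse_int_param("1x", &val))	/* rejected, param_enabled untouched */
		printf("rejected malformed input\n");
	return 0;
}
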
diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c
index 036efbea8b2..aef7140c006 100644
--- a/arch/x86/mm/kmemcheck/selftest.c
+++ b/arch/x86/mm/kmemcheck/selftest.c
@@ -1,3 +1,4 @@
+#include <linux/bug.h>
#include <linux/kernel.h>
#include "opcode.h"
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index e5d5e2ce9f7..637ab34ed63 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -11,7 +11,6 @@
#include <linux/rculist.h>
#include <linux/spinlock.h>
#include <linux/hash.h>
-#include <linux/init.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index c80b9fb9573..1e9da795767 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -9,6 +9,7 @@
#include <linux/memblock.h>
static u64 patterns[] __initdata = {
+ /* The first entry has to be 0 to leave memtest with zeroed memory */
0,
0xffffffffffffffffULL,
0x5555555555555555ULL,
@@ -73,7 +74,7 @@ static void __init do_one_pass(u64 pattern, u64 start, u64 end)
u64 i;
phys_addr_t this_start, this_end;
- for_each_free_mem_range(i, MAX_NUMNODES, &this_start, &this_end, NULL) {
+ for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) {
this_start = clamp_t(phys_addr_t, this_start, start, end);
this_end = clamp_t(phys_addr_t, this_end, start, end);
if (this_start < this_end) {
@@ -110,15 +111,8 @@ void __init early_memtest(unsigned long start, unsigned long end)
return;
printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern);
- for (i = 0; i < memtest_pattern; i++) {
+ for (i = memtest_pattern-1; i < UINT_MAX; --i) {
idx = i % ARRAY_SIZE(patterns);
do_one_pass(patterns[idx], start, end);
}
-
- if (idx > 0) {
- printk(KERN_INFO "early_memtest: wipe out "
- "test pattern from memory\n");
- /* additional test with pattern 0 will do this */
- do_one_pass(0, start, end);
- }
}
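
The memtest change above walks the pattern table from index memtest_pattern-1 down to 0 and relies on unsigned wraparound to terminate the loop, so the all-zeroes pattern at index 0 always runs last and the explicit wipe-out pass can go away. A stand-alone sketch of that loop shape; nr_passes and the trimmed pattern table are illustrative, and the actual memory pass is replaced by a printf.

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

static const uint64_t patterns[] = {
	0,			/* must stay first: the final pass leaves memory zeroed */
	0xffffffffffffffffULL,
	0x5555555555555555ULL,
};

int main(void)
{
	unsigned int nr_passes = 4;	/* stand-in for memtest_pattern */
	unsigned int i, idx;

	/* Count down; when i wraps past 0 it becomes UINT_MAX and the loop stops. */
	for (i = nr_passes - 1; i < UINT_MAX; --i) {
		idx = i % (sizeof(patterns) / sizeof(patterns[0]));
		printf("pass %u: pattern 0x%016llx\n", i,
		       (unsigned long long)patterns[idx]);
	}
	return 0;
}
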
diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h
new file mode 100644
index 00000000000..6b563a11889
--- /dev/null
+++ b/arch/x86/mm/mm_internal.h
@@ -0,0 +1,19 @@
+#ifndef __X86_MM_INTERNAL_H
+#define __X86_MM_INTERNAL_H
+
+void *alloc_low_pages(unsigned int num);
+static inline void *alloc_low_page(void)
+{
+ return alloc_low_pages(1);
+}
+
+void early_ioremap_page_table_range_init(void);
+
+unsigned long kernel_physical_mapping_init(unsigned long start,
+ unsigned long end,
+ unsigned long page_size_mask);
+void zone_sizes_init(void);
+
+extern int after_bootmem;
+
+#endif /* __X86_MM_INTERNAL_H */
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 845df6835f9..25e7e1372bb 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -112,13 +112,13 @@ static unsigned long mmap_legacy_base(void)
*/
void arch_pick_mmap_layout(struct mm_struct *mm)
{
+ mm->mmap_legacy_base = mmap_legacy_base();
+ mm->mmap_base = mmap_base();
+
if (mmap_is_legacy()) {
- mm->mmap_base = mmap_legacy_base();
+ mm->mmap_base = mm->mmap_legacy_base;
mm->get_unmapped_area = arch_get_unmapped_area;
- mm->unmap_area = arch_unmap_area;
} else {
- mm->mmap_base = mmap_base();
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
- mm->unmap_area = arch_unmap_area_topdown;
}
}
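
With the mmap.c change above, both the bottom-up legacy base and the top-down base are computed once and stored in the mm, and only the get_unmapped_area callback differs between the two layouts (the unmap_area hooks are gone). A hedged sketch of that compute-both-then-pick shape; the struct, the base values and the callback names here are placeholders, since the real bases depend on rlimits and randomization.

#include <stdbool.h>
#include <stdio.h>

struct fake_mm {
	unsigned long mmap_legacy_base;	/* bottom-up starting point */
	unsigned long mmap_base;	/* top-down starting point */
	const char *get_unmapped_area;	/* name only; real code stores a callback */
};

static void pick_mmap_layout(struct fake_mm *mm, bool legacy)
{
	/* Both bases are always filled in... */
	mm->mmap_legacy_base = 0x10000000UL;	/* illustrative value */
	mm->mmap_base = 0xbff00000UL;		/* illustrative value */

	/* ...and only the allocation policy depends on the chosen layout. */
	if (legacy) {
		mm->mmap_base = mm->mmap_legacy_base;
		mm->get_unmapped_area = "bottom-up";
	} else {
		mm->get_unmapped_area = "top-down";
	}
}

int main(void)
{
	struct fake_mm mm;

	pick_mmap_layout(&mm, false);
	printf("%s search from %#lx\n", mm.get_unmapped_area, mm.mmap_base);
	return 0;
}
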
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index dc0b727742f..0057a7accfb 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -410,9 +410,7 @@ out:
pr_warning("multiple CPUs still online, may miss events.\n");
}
-/* __ref because leave_uniprocessor calls cpu_up which is __cpuinit,
- but this whole function is ifdefed CONFIG_HOTPLUG_CPU */
-static void __ref leave_uniprocessor(void)
+static void leave_uniprocessor(void)
{
int cpu;
int err;
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 19d3fa08b11..a32b706c401 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -56,11 +56,11 @@ early_param("numa", numa_setup);
/*
* apicid, cpu, node mappings
*/
-s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
+s16 __apicid_to_node[MAX_LOCAL_APIC] = {
[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
-int __cpuinit numa_cpu_node(int cpu)
+int numa_cpu_node(int cpu)
{
int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
@@ -78,7 +78,7 @@ EXPORT_SYMBOL(node_to_cpumask_map);
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
-void __cpuinit numa_set_node(int cpu, int node)
+void numa_set_node(int cpu, int node)
{
int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
@@ -97,11 +97,10 @@ void __cpuinit numa_set_node(int cpu, int node)
#endif
per_cpu(x86_cpu_to_node_map, cpu) = node;
- if (node != NUMA_NO_NODE)
- set_cpu_numa_node(cpu, node);
+ set_cpu_numa_node(cpu, node);
}
-void __cpuinit numa_clear_node(int cpu)
+void numa_clear_node(int cpu)
{
numa_set_node(cpu, NUMA_NO_NODE);
}
@@ -115,14 +114,11 @@ void __cpuinit numa_clear_node(int cpu)
*/
void __init setup_node_to_cpumask_map(void)
{
- unsigned int node, num = 0;
+ unsigned int node;
/* setup nr_node_ids if not done yet */
- if (nr_node_ids == MAX_NUMNODES) {
- for_each_node_mask(node, node_possible_map)
- num = node;
- nr_node_ids = num + 1;
- }
+ if (nr_node_ids == MAX_NUMNODES)
+ setup_nr_node_ids();
/* allocate the map */
for (node = 0; node < nr_node_ids; node++)
@@ -141,8 +137,8 @@ static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
/* whine about and ignore invalid blks */
if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
- pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
- nid, start, end);
+ pr_warning("NUMA: Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
+ nid, start, end - 1);
return 0;
}
@@ -193,7 +189,6 @@ int __init numa_add_memblk(int nid, u64 start, u64 end)
static void __init setup_node_data(int nid, u64 start, u64 end)
{
const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
- bool remapped = false;
u64 nd_pa;
void *nd;
int tnid;
@@ -205,37 +200,32 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
if (end && (end - start) < NODE_MIN_SIZE)
return;
- /* initialize remap allocator before aligning to ZONE_ALIGN */
- init_alloc_remap(nid, start, end);
-
start = roundup(start, ZONE_ALIGN);
- printk(KERN_INFO "Initmem setup node %d %016Lx-%016Lx\n",
- nid, start, end);
+ printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n",
+ nid, start, end - 1);
/*
- * Allocate node data. Try remap allocator first, node-local
- * memory and then any node. Never allocate in DMA zone.
+ * Allocate node data. Try node-local memory and then any node.
+ * Never allocate in DMA zone.
*/
- nd = alloc_remap(nid, nd_size);
- if (nd) {
- nd_pa = __pa(nd);
- remapped = true;
- } else {
- nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
+ nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
+ if (!nd_pa) {
+ nd_pa = __memblock_alloc_base(nd_size, SMP_CACHE_BYTES,
+ MEMBLOCK_ALLOC_ACCESSIBLE);
if (!nd_pa) {
pr_err("Cannot find %zu bytes in node %d\n",
nd_size, nid);
return;
}
- nd = __va(nd_pa);
}
+ nd = __va(nd_pa);
/* report and initialize */
- printk(KERN_INFO " NODE_DATA [%016Lx - %016Lx]%s\n",
- nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : "");
+ printk(KERN_INFO " NODE_DATA [mem %#010Lx-%#010Lx]\n",
+ nd_pa, nd_pa + nd_size - 1);
tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
- if (!remapped && tnid != nid)
+ if (tnid != nid)
printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid);
node_data[nid] = nd;
@@ -291,14 +281,14 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
*/
if (bi->end > bj->start && bi->start < bj->end) {
if (bi->nid != bj->nid) {
- pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
- bi->nid, bi->start, bi->end,
- bj->nid, bj->start, bj->end);
+ pr_err("NUMA: node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
+ bi->nid, bi->start, bi->end - 1,
+ bj->nid, bj->start, bj->end - 1);
return -EINVAL;
}
- pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
- bi->nid, bi->start, bi->end,
- bj->start, bj->end);
+ pr_warning("NUMA: Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
+ bi->nid, bi->start, bi->end - 1,
+ bj->start, bj->end - 1);
}
/*
@@ -320,9 +310,9 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
}
if (k < mi->nr_blks)
continue;
- printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%Lx,%Lx)\n",
- bi->nid, bi->start, bi->end, bj->start, bj->end,
- start, end);
+ printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n",
+ bi->nid, bi->start, bi->end - 1, bj->start,
+ bj->end - 1, start, end - 1);
bi->start = start;
bi->end = end;
numa_remove_memblk_from(j--, mi);
@@ -501,7 +491,8 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
for (i = 0; i < mi->nr_blks; i++) {
struct numa_memblk *mb = &mi->blk[i];
- memblock_set_node(mb->start, mb->end - mb->start, mb->nid);
+ memblock_set_node(mb->start, mb->end - mb->start,
+ &memblock.memory, mb->nid);
}
/*
@@ -563,6 +554,41 @@ static void __init numa_init_array(void)
}
}
+static void __init numa_clear_kernel_node_hotplug(void)
+{
+ int i, nid;
+ nodemask_t numa_kernel_nodes = NODE_MASK_NONE;
+ unsigned long start, end;
+ struct memblock_region *r;
+
+ /*
+	 * At this time, all memory regions reserved by memblock are
+	 * used by the kernel.  Setting the nid in memblock.reserved
+	 * will mark all the nodes the kernel resides in.
+ */
+ for (i = 0; i < numa_meminfo.nr_blks; i++) {
+ struct numa_memblk *mb = &numa_meminfo.blk[i];
+ memblock_set_node(mb->start, mb->end - mb->start,
+ &memblock.reserved, mb->nid);
+ }
+
+ /* Mark all kernel nodes. */
+ for_each_memblock(reserved, r)
+ node_set(r->nid, numa_kernel_nodes);
+
+ /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */
+ for (i = 0; i < numa_meminfo.nr_blks; i++) {
+ nid = numa_meminfo.blk[i].nid;
+ if (!node_isset(nid, numa_kernel_nodes))
+ continue;
+
+ start = numa_meminfo.blk[i].start;
+ end = numa_meminfo.blk[i].end;
+
+ memblock_clear_hotplug(start, end - start);
+ }
+}
+
static int __init numa_init(int (*init_func)(void))
{
int i;
@@ -575,12 +601,28 @@ static int __init numa_init(int (*init_func)(void))
nodes_clear(node_possible_map);
nodes_clear(node_online_map);
memset(&numa_meminfo, 0, sizeof(numa_meminfo));
- WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES));
+ WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory,
+ MAX_NUMNODES));
+ WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved,
+ MAX_NUMNODES));
+ /* In case that parsing SRAT failed. */
+ WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX));
numa_reset_distance();
ret = init_func();
if (ret < 0)
return ret;
+
+ /*
+ * We reset memblock back to the top-down direction
+ * here because if we configured ACPI_NUMA, we have
+	 * parsed SRAT in init_func().  It is ok to have the
+	 * reset here even if we didn't configure ACPI_NUMA
+	 * or the ACPI NUMA init fails and falls back to
+	 * dummy NUMA init.
+ */
+ memblock_set_bottom_up(false);
+
ret = numa_cleanup_meminfo(&numa_meminfo);
if (ret < 0)
return ret;
@@ -600,6 +642,16 @@ static int __init numa_init(int (*init_func)(void))
numa_clear_node(i);
}
numa_init_array();
+
+ /*
+	 * Very early on, the kernel has to use some memory, e.g. for
+	 * loading the kernel image.  We cannot prevent this anyway, so
+	 * any node the kernel resides in should be un-hotpluggable.
+	 *
+	 * And by the time we get here, numa_init() can no longer fail.
+ */
+ numa_clear_kernel_node_hotplug();
+
return 0;
}
@@ -616,8 +668,8 @@ static int __init dummy_numa_init(void)
{
printk(KERN_INFO "%s\n",
numa_off ? "NUMA turned off" : "No NUMA configuration found");
- printk(KERN_INFO "Faking a node at %016Lx-%016Lx\n",
- 0LLU, PFN_PHYS(max_pfn));
+ printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
+ 0LLU, PFN_PHYS(max_pfn) - 1);
node_set(0, numa_nodes_parsed);
numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
@@ -635,10 +687,6 @@ static int __init dummy_numa_init(void)
void __init x86_numa_init(void)
{
if (!numa_off) {
-#ifdef CONFIG_X86_NUMAQ
- if (!numa_init(numaq_numa_init))
- return;
-#endif
#ifdef CONFIG_ACPI_NUMA
if (!numa_init(x86_acpi_numa_init))
return;
@@ -705,12 +753,12 @@ void __init init_cpu_to_node(void)
#ifndef CONFIG_DEBUG_PER_CPU_MAPS
# ifndef CONFIG_NUMA_EMU
-void __cpuinit numa_add_cpu(int cpu)
+void numa_add_cpu(int cpu)
{
cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}
-void __cpuinit numa_remove_cpu(int cpu)
+void numa_remove_cpu(int cpu)
{
cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}
@@ -777,17 +825,17 @@ void debug_cpumask_set_cpu(int cpu, int node, bool enable)
}
# ifndef CONFIG_NUMA_EMU
-static void __cpuinit numa_set_cpumask(int cpu, bool enable)
+static void numa_set_cpumask(int cpu, bool enable)
{
debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
}
-void __cpuinit numa_add_cpu(int cpu)
+void numa_add_cpu(int cpu)
{
numa_set_cpumask(cpu, true);
}
-void __cpuinit numa_remove_cpu(int cpu)
+void numa_remove_cpu(int cpu)
{
numa_set_cpumask(cpu, false);
}
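
numa_clear_kernel_node_hotplug(), added above, works in two sweeps: it first propagates node ids onto the reserved memblock regions and collects the set of nodes that contain any kernel-reserved memory, then clears the hotplug flag from every meminfo block sitting on one of those nodes. A compact user-space model of that two-pass idea; the node mask is a plain bitmask, the region lists are fixed arrays, and attributing a reserved region to a node by containment is a simplification of what memblock_set_node() does.

#include <stdio.h>

struct blk { unsigned long start, end; int nid; int hotplug; };

int main(void)
{
	/* Memory blocks as parsed from SRAT, some marked hotpluggable. */
	struct blk meminfo[] = {
		{ 0x00000000, 0x40000000, 0, 1 },
		{ 0x40000000, 0x80000000, 1, 1 },
	};
	/* Regions the early allocator has already handed to the kernel. */
	struct blk reserved[] = {
		{ 0x01000000, 0x02000000, -1, 0 },	/* say, the kernel image */
	};
	unsigned long kernel_nodes = 0;	/* stand-in for a nodemask_t */
	unsigned int i, j;

	/* Pass 1: attribute each reserved region to a node and record that node. */
	for (i = 0; i < sizeof(reserved) / sizeof(reserved[0]); i++)
		for (j = 0; j < sizeof(meminfo) / sizeof(meminfo[0]); j++)
			if (reserved[i].start >= meminfo[j].start &&
			    reserved[i].end   <= meminfo[j].end) {
				reserved[i].nid = meminfo[j].nid;
				kernel_nodes |= 1UL << meminfo[j].nid;
			}

	/* Pass 2: any block on a node used by the kernel loses its hotplug flag. */
	for (j = 0; j < sizeof(meminfo) / sizeof(meminfo[0]); j++)
		if (kernel_nodes & (1UL << meminfo[j].nid))
			meminfo[j].hotplug = 0;

	for (j = 0; j < sizeof(meminfo) / sizeof(meminfo[0]); j++)
		printf("node %d: hotplug=%d\n", meminfo[j].nid, meminfo[j].hotplug);
	return 0;
}
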
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 534255a36b6..47b6436e41c 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -52,6 +52,8 @@ void memory_present(int nid, unsigned long start, unsigned long end)
nid, start, end);
printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid);
printk(KERN_DEBUG " ");
+ start = round_down(start, PAGES_PER_SECTION);
+ end = round_up(end, PAGES_PER_SECTION);
for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
physnode_map[pfn / PAGES_PER_SECTION] = nid;
printk(KERN_CONT "%lx ", pfn);
@@ -73,167 +75,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
extern unsigned long highend_pfn, highstart_pfn;
-#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
-
-static void *node_remap_start_vaddr[MAX_NUMNODES];
-void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
-
-/*
- * Remap memory allocator
- */
-static unsigned long node_remap_start_pfn[MAX_NUMNODES];
-static void *node_remap_end_vaddr[MAX_NUMNODES];
-static void *node_remap_alloc_vaddr[MAX_NUMNODES];
-
-/**
- * alloc_remap - Allocate remapped memory
- * @nid: NUMA node to allocate memory from
- * @size: The size of allocation
- *
- * Allocate @size bytes from the remap area of NUMA node @nid. The
- * size of the remap area is predetermined by init_alloc_remap() and
- * only the callers considered there should call this function. For
- * more info, please read the comment on top of init_alloc_remap().
- *
- * The caller must be ready to handle allocation failure from this
- * function and fall back to regular memory allocator in such cases.
- *
- * CONTEXT:
- * Single CPU early boot context.
- *
- * RETURNS:
- * Pointer to the allocated memory on success, %NULL on failure.
- */
-void *alloc_remap(int nid, unsigned long size)
-{
- void *allocation = node_remap_alloc_vaddr[nid];
-
- size = ALIGN(size, L1_CACHE_BYTES);
-
- if (!allocation || (allocation + size) > node_remap_end_vaddr[nid])
- return NULL;
-
- node_remap_alloc_vaddr[nid] += size;
- memset(allocation, 0, size);
-
- return allocation;
-}
-
-#ifdef CONFIG_HIBERNATION
-/**
- * resume_map_numa_kva - add KVA mapping to the temporary page tables created
- * during resume from hibernation
- * @pgd_base - temporary resume page directory
- */
-void resume_map_numa_kva(pgd_t *pgd_base)
-{
- int node;
-
- for_each_online_node(node) {
- unsigned long start_va, start_pfn, nr_pages, pfn;
-
- start_va = (unsigned long)node_remap_start_vaddr[node];
- start_pfn = node_remap_start_pfn[node];
- nr_pages = (node_remap_end_vaddr[node] -
- node_remap_start_vaddr[node]) >> PAGE_SHIFT;
-
- printk(KERN_DEBUG "%s: node %d\n", __func__, node);
-
- for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) {
- unsigned long vaddr = start_va + (pfn << PAGE_SHIFT);
- pgd_t *pgd = pgd_base + pgd_index(vaddr);
- pud_t *pud = pud_offset(pgd, vaddr);
- pmd_t *pmd = pmd_offset(pud, vaddr);
-
- set_pmd(pmd, pfn_pmd(start_pfn + pfn,
- PAGE_KERNEL_LARGE_EXEC));
-
- printk(KERN_DEBUG "%s: %08lx -> pfn %08lx\n",
- __func__, vaddr, start_pfn + pfn);
- }
- }
-}
-#endif
-
-/**
- * init_alloc_remap - Initialize remap allocator for a NUMA node
- * @nid: NUMA node to initizlie remap allocator for
- *
- * NUMA nodes may end up without any lowmem. As allocating pgdat and
- * memmap on a different node with lowmem is inefficient, a special
- * remap allocator is implemented which can be used by alloc_remap().
- *
- * For each node, the amount of memory which will be necessary for
- * pgdat and memmap is calculated and two memory areas of the size are
- * allocated - one in the node and the other in lowmem; then, the area
- * in the node is remapped to the lowmem area.
- *
- * As pgdat and memmap must be allocated in lowmem anyway, this
- * doesn't waste lowmem address space; however, the actual lowmem
- * which gets remapped over is wasted. The amount shouldn't be
- * problematic on machines this feature will be used.
- *
- * Initialization failure isn't fatal. alloc_remap() is used
- * opportunistically and the callers will fall back to other memory
- * allocation mechanisms on failure.
- */
-void __init init_alloc_remap(int nid, u64 start, u64 end)
-{
- unsigned long start_pfn = start >> PAGE_SHIFT;
- unsigned long end_pfn = end >> PAGE_SHIFT;
- unsigned long size, pfn;
- u64 node_pa, remap_pa;
- void *remap_va;
-
- /*
- * The acpi/srat node info can show hot-add memroy zones where
- * memory could be added but not currently present.
- */
- printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
- nid, start_pfn, end_pfn);
-
- /* calculate the necessary space aligned to large page size */
- size = node_memmap_size_bytes(nid, start_pfn, end_pfn);
- size += ALIGN(sizeof(pg_data_t), PAGE_SIZE);
- size = ALIGN(size, LARGE_PAGE_BYTES);
-
- /* allocate node memory and the lowmem remap area */
- node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES);
- if (!node_pa) {
- pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n",
- size, nid);
- return;
- }
- memblock_reserve(node_pa, size);
-
- remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT,
- max_low_pfn << PAGE_SHIFT,
- size, LARGE_PAGE_BYTES);
- if (!remap_pa) {
- pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n",
- size, nid);
- memblock_free(node_pa, size);
- return;
- }
- memblock_reserve(remap_pa, size);
- remap_va = phys_to_virt(remap_pa);
-
- /* perform actual remap */
- for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE)
- set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT),
- (node_pa >> PAGE_SHIFT) + pfn,
- PAGE_KERNEL_LARGE);
-
- /* initialize remap allocator parameters */
- node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT;
- node_remap_start_vaddr[nid] = remap_va;
- node_remap_end_vaddr[nid] = remap_va + size;
- node_remap_alloc_vaddr[nid] = remap_va;
-
- printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n",
- nid, node_pa, node_pa + size, remap_va, remap_va + size);
-}
-
void __init initmem_init(void)
{
x86_numa_init();
@@ -244,10 +85,8 @@ void __init initmem_init(void)
highstart_pfn = max_low_pfn;
printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
pages_to_mb(highend_pfn - highstart_pfn));
- num_physpages = highend_pfn;
high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
#else
- num_physpages = max_low_pfn;
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
#endif
printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
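
memory_present() in numa_32.c now rounds the pfn range outward to PAGES_PER_SECTION boundaries before filling physnode_map, so a node whose range starts or ends mid-section still tags the whole section. The rounding is the usual power-of-two mask trick; a short sketch, with an illustrative section size standing in for the real constant.

#include <stdio.h>

#define PAGES_PER_SECTION 0x8000UL	/* illustrative; must be a power of two */

/* Round down/up to a power-of-two boundary, like round_down()/round_up(). */
static unsigned long rdown(unsigned long x, unsigned long a) { return x & ~(a - 1); }
static unsigned long rup(unsigned long x, unsigned long a)   { return (x + a - 1) & ~(a - 1); }

int main(void)
{
	unsigned long start = 0x12345, end = 0x54321;

	printf("sections cover pfns [%#lx, %#lx)\n",
	       rdown(start, PAGES_PER_SECTION), rup(end, PAGES_PER_SECTION));
	return 0;
}
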
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 92e27119ee1..9405ffc9150 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -10,16 +10,3 @@ void __init initmem_init(void)
{
x86_numa_init();
}
-
-unsigned long __init numa_free_all_bootmem(void)
-{
- unsigned long pages = 0;
- int i;
-
- for_each_online_node(i)
- pages += free_all_bootmem_node(NODE_DATA(i));
-
- pages += free_low_memory_core_early(MAX_NUMNODES);
-
- return pages;
-}
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index 46db56845f1..a8f90ce3ded 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -10,7 +10,7 @@
#include "numa_internal.h"
-static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata;
+static int emu_nid_to_phys[MAX_NUMNODES];
static char *emu_cmdline __initdata;
void __init numa_emu_cmdline(char *str)
@@ -28,7 +28,7 @@ static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
return -ENOENT;
}
-static u64 mem_hole_size(u64 start, u64 end)
+static u64 __init mem_hole_size(u64 start, u64 end)
{
unsigned long start_pfn = PFN_UP(start);
unsigned long end_pfn = PFN_DOWN(end);
@@ -60,7 +60,7 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei,
eb->nid = nid;
if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
- emu_nid_to_phys[nid] = pb->nid;
+ emu_nid_to_phys[nid] = nid;
pb->start += size;
if (pb->start >= pb->end) {
@@ -68,8 +68,8 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei,
numa_remove_memblk_from(phys_blk, pi);
}
- printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
- eb->start, eb->end, (eb->end - eb->start) >> 20);
+ printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n",
+ nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20);
return 0;
}
@@ -339,9 +339,11 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
} else {
unsigned long n;
- n = simple_strtoul(emu_cmdline, NULL, 0);
+ n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
}
+ if (*emu_cmdline == ':')
+ emu_cmdline++;
if (ret < 0)
goto no_emu;
@@ -418,7 +420,9 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
int physj = emu_nid_to_phys[j];
int dist;
- if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
+ if (get_option(&emu_cmdline, &dist) == 2)
+ ;
+ else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
dist = physi == physj ?
LOCAL_DISTANCE : REMOTE_DISTANCE;
else
@@ -440,7 +444,7 @@ no_emu:
}
#ifndef CONFIG_DEBUG_PER_CPU_MAPS
-void __cpuinit numa_add_cpu(int cpu)
+void numa_add_cpu(int cpu)
{
int physnid, nid;
@@ -458,7 +462,7 @@ void __cpuinit numa_add_cpu(int cpu)
cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
}
-void __cpuinit numa_remove_cpu(int cpu)
+void numa_remove_cpu(int cpu)
{
int i;
@@ -466,7 +470,7 @@ void __cpuinit numa_remove_cpu(int cpu)
cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
}
#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
-static void __cpuinit numa_set_cpumask(int cpu, bool enable)
+static void numa_set_cpumask(int cpu, bool enable)
{
int nid, physnid;
@@ -486,12 +490,12 @@ static void __cpuinit numa_set_cpumask(int cpu, bool enable)
}
}
-void __cpuinit numa_add_cpu(int cpu)
+void numa_add_cpu(int cpu)
{
numa_set_cpumask(cpu, true);
}
-void __cpuinit numa_remove_cpu(int cpu)
+void numa_remove_cpu(int cpu)
{
numa_set_cpumask(cpu, false);
}
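
The numa_emulation.c changes above let simple_strtoul() advance emu_cmdline past the node count, skip an optional ':' separator, and later hand the remainder to get_option() for per-pair distances. A user-space approximation of parsing such a "count:dist,dist,..." option string with strtoul/strtol; the exact option syntax shown is an assumption for illustration, not a documented format.

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	char cmdline[] = "4:10,20,20,10";	/* illustrative option string */
	char *p = cmdline;
	unsigned long nr_nodes;
	long dist;

	/* First field: number of emulated nodes; strtoul leaves p at the delimiter. */
	nr_nodes = strtoul(p, &p, 0);
	if (*p == ':')
		p++;
	printf("emulating %lu nodes\n", nr_nodes);

	/* Remaining fields: inter-node distances, comma separated. */
	while (*p) {
		dist = strtol(p, &p, 0);
		if (*p == ',')
			p++;
		printf("next distance: %ld\n", dist);
	}
	return 0;
}
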
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
index 7178c3afe05..ad86ec91e64 100644
--- a/arch/x86/mm/numa_internal.h
+++ b/arch/x86/mm/numa_internal.h
@@ -21,12 +21,6 @@ void __init numa_reset_distance(void);
void __init x86_numa_init(void);
-#ifdef CONFIG_X86_64
-static inline void init_alloc_remap(int nid, u64 start, u64 end) { }
-#else
-void __init init_alloc_remap(int nid, u64 start, u64 end);
-#endif
-
#ifdef CONFIG_NUMA_EMU
void __init numa_emulation(struct numa_meminfo *numa_meminfo,
int numa_dist_cnt);
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index b0086567271..6629f397b46 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -8,7 +8,6 @@
#include <linux/kthread.h>
#include <linux/random.h>
#include <linux/kernel.h>
-#include <linux/init.h>
#include <linux/mm.h>
#include <asm/cacheflush.h>
@@ -36,7 +35,7 @@ enum {
static int pte_testbit(pte_t pte)
{
- return pte_flags(pte) & _PAGE_UNUSED1;
+ return pte_flags(pte) & _PAGE_SOFTW1;
}
struct split_state {
@@ -68,7 +67,7 @@ static int print_split(struct split_state *s)
s->gpg++;
i += GPS/PAGE_SIZE;
} else if (level == PG_LEVEL_2M) {
- if (!(pte_val(*pte) & _PAGE_PSE)) {
+ if ((pte_val(*pte) & _PAGE_PRESENT) && !(pte_val(*pte) & _PAGE_PSE)) {
printk(KERN_ERR
"%lx level %d but not PSE %Lx\n",
addr, level, (u64)pte_val(*pte));
@@ -130,13 +129,12 @@ static int pageattr_test(void)
}
failed += print_split(&sa);
- srandom32(100);
for (i = 0; i < NTEST; i++) {
- unsigned long pfn = random32() % max_pfn_mapped;
+ unsigned long pfn = prandom_u32() % max_pfn_mapped;
addr[i] = (unsigned long)__va(pfn << PAGE_SHIFT);
- len[i] = random32() % 100;
+ len[i] = prandom_u32() % 100;
len[i] = min_t(unsigned long, len[i], max_pfn_mapped - pfn - 1);
if (len[i] == 0)
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index e1ebde31521..ae242a7c11c 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -30,6 +30,7 @@
*/
struct cpa_data {
unsigned long *vaddr;
+ pgd_t *pgd;
pgprot_t mask_set;
pgprot_t mask_clr;
int numpages;
@@ -94,12 +95,12 @@ static inline void split_page_count(int level) { }
static inline unsigned long highmap_start_pfn(void)
{
- return __pa(_text) >> PAGE_SHIFT;
+ return __pa_symbol(_text) >> PAGE_SHIFT;
}
static inline unsigned long highmap_end_pfn(void)
{
- return __pa(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
+ return __pa_symbol(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
}
#endif
@@ -122,11 +123,11 @@ within(unsigned long addr, unsigned long start, unsigned long end)
/**
* clflush_cache_range - flush a cache range with clflush
- * @addr: virtual start address
+ * @vaddr: virtual start address
* @size: number of bytes to flush
*
- * clflush is an unordered instruction which needs fencing with mfence
- * to avoid ordering issues.
+ * clflushopt is an unordered instruction which needs fencing with mfence or
+ * sfence to avoid ordering issues.
*/
void clflush_cache_range(void *vaddr, unsigned int size)
{
@@ -135,11 +136,11 @@ void clflush_cache_range(void *vaddr, unsigned int size)
mb();
for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
- clflush(vaddr);
+ clflushopt(vaddr);
/*
* Flush any possible final partial cacheline:
*/
- clflush(vend);
+ clflushopt(vend);
mb();
}
@@ -276,8 +277,8 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
* The .rodata section needs to be read-only. Using the pfn
* catches all aliases.
*/
- if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
- __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
+ if (within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT,
+ __pa_symbol(__end_rodata) >> PAGE_SHIFT))
pgprot_val(forbidden) |= _PAGE_RW;
#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
@@ -323,16 +324,12 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
}
/*
- * Lookup the page table entry for a virtual address. Return a pointer
- * to the entry and the level of the mapping.
- *
- * Note: We return pud and pmd either when the entry is marked large
- * or when the present bit is not set. Otherwise we would return a
- * pointer to a nonexisting mapping.
+ * Lookup the page table entry for a virtual address in a specific pgd.
+ * Return a pointer to the entry and the level of the mapping.
*/
-pte_t *lookup_address(unsigned long address, unsigned int *level)
+pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
+ unsigned int *level)
{
- pgd_t *pgd = pgd_offset_k(address);
pud_t *pud;
pmd_t *pmd;
@@ -361,8 +358,62 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
return pte_offset_kernel(pmd, address);
}
+
+/*
+ * Lookup the page table entry for a virtual address. Return a pointer
+ * to the entry and the level of the mapping.
+ *
+ * Note: We return pud and pmd either when the entry is marked large
+ * or when the present bit is not set. Otherwise we would return a
+ * pointer to a nonexisting mapping.
+ */
+pte_t *lookup_address(unsigned long address, unsigned int *level)
+{
+ return lookup_address_in_pgd(pgd_offset_k(address), address, level);
+}
EXPORT_SYMBOL_GPL(lookup_address);
+static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
+ unsigned int *level)
+{
+ if (cpa->pgd)
+ return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
+ address, level);
+
+ return lookup_address(address, level);
+}
+
+/*
+ * This is necessary because __pa() does not work on some
+ * kinds of memory, like vmalloc() or the alloc_remap()
+ * areas on 32-bit NUMA systems. The percpu areas can
+ * end up in this kind of memory, for instance.
+ *
+ * This could be optimized, but it is only intended to be
+ * used at initialization time, and keeping it
+ * unoptimized should increase the testing coverage for
+ * the more obscure platforms.
+ */
+phys_addr_t slow_virt_to_phys(void *__virt_addr)
+{
+ unsigned long virt_addr = (unsigned long)__virt_addr;
+ phys_addr_t phys_addr;
+ unsigned long offset;
+ enum pg_level level;
+ unsigned long psize;
+ unsigned long pmask;
+ pte_t *pte;
+
+ pte = lookup_address(virt_addr, &level);
+ BUG_ON(!pte);
+ psize = page_level_size(level);
+ pmask = page_level_mask(level);
+ offset = virt_addr & ~pmask;
+ phys_addr = pte_pfn(*pte) << PAGE_SHIFT;
+ return (phys_addr | offset);
+}
+EXPORT_SYMBOL_GPL(slow_virt_to_phys);
+
/*
* Set the new pmd in all the pgds we know about:
*/
@@ -396,7 +447,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
pte_t new_pte, old_pte, *tmp;
pgprot_t old_prot, new_prot, req_prot;
int i, do_split = 1;
- unsigned int level;
+ enum pg_level level;
if (cpa->force_split)
return 1;
@@ -406,21 +457,18 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
* Check for races, another CPU might have split this page
* up already:
*/
- tmp = lookup_address(address, &level);
+ tmp = _lookup_address_cpa(cpa, address, &level);
if (tmp != kpte)
goto out_unlock;
switch (level) {
case PG_LEVEL_2M:
- psize = PMD_PAGE_SIZE;
- pmask = PMD_PAGE_MASK;
- break;
#ifdef CONFIG_X86_64
case PG_LEVEL_1G:
- psize = PUD_PAGE_SIZE;
- pmask = PUD_PAGE_MASK;
- break;
#endif
+ psize = page_level_size(level);
+ pmask = page_level_mask(level);
+ break;
default:
do_split = -EINVAL;
goto out_unlock;
@@ -439,12 +487,25 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
* We are safe now. Check whether the new pgprot is the same:
*/
old_pte = *kpte;
- old_prot = new_prot = req_prot = pte_pgprot(old_pte);
+ old_prot = req_prot = pte_pgprot(old_pte);
pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
/*
+	 * Set the PSE and GLOBAL flags only if the PRESENT flag is
+	 * set, otherwise pmd_present/pmd_huge will return true even on
+	 * a non-present pmd.  canon_pgprot() will clear _PAGE_GLOBAL
+	 * for the ancient hardware that doesn't support it.
+ */
+ if (pgprot_val(req_prot) & _PAGE_PRESENT)
+ pgprot_val(req_prot) |= _PAGE_PSE | _PAGE_GLOBAL;
+ else
+ pgprot_val(req_prot) &= ~(_PAGE_PSE | _PAGE_GLOBAL);
+
+ req_prot = canon_pgprot(req_prot);
+
+ /*
* old_pte points to the large page base address. So we need
* to add the offset of the virtual address:
*/
@@ -489,7 +550,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
* The address is aligned and the number of pages
* covers the full page.
*/
- new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
+ new_pte = pfn_pte(pte_pfn(old_pte), new_prot);
__set_pmd_pte(kpte, address, new_pte);
cpa->flags |= CPA_FLUSHTLB;
do_split = 0;
@@ -501,32 +562,27 @@ out_unlock:
return do_split;
}
-static int split_large_page(pte_t *kpte, unsigned long address)
+static int
+__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
+ struct page *base)
{
+ pte_t *pbase = (pte_t *)page_address(base);
unsigned long pfn, pfninc = 1;
unsigned int i, level;
- pte_t *pbase, *tmp;
+ pte_t *tmp;
pgprot_t ref_prot;
- struct page *base;
-
- if (!debug_pagealloc)
- spin_unlock(&cpa_lock);
- base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
- if (!debug_pagealloc)
- spin_lock(&cpa_lock);
- if (!base)
- return -ENOMEM;
spin_lock(&pgd_lock);
/*
* Check for races, another CPU might have split this page
* up for us already:
*/
- tmp = lookup_address(address, &level);
- if (tmp != kpte)
- goto out_unlock;
+ tmp = _lookup_address_cpa(cpa, address, &level);
+ if (tmp != kpte) {
+ spin_unlock(&pgd_lock);
+ return 1;
+ }
- pbase = (pte_t *)page_address(base);
paravirt_alloc_pte(&init_mm, page_to_pfn(base));
ref_prot = pte_pgprot(pte_clrhuge(*kpte));
/*
@@ -540,27 +596,40 @@ static int split_large_page(pte_t *kpte, unsigned long address)
#ifdef CONFIG_X86_64
if (level == PG_LEVEL_1G) {
pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
- pgprot_val(ref_prot) |= _PAGE_PSE;
+ /*
+		 * Set the PSE flag only if the PRESENT flag is set,
+		 * otherwise pmd_present/pmd_huge will return true
+		 * even on a non-present pmd.
+ */
+ if (pgprot_val(ref_prot) & _PAGE_PRESENT)
+ pgprot_val(ref_prot) |= _PAGE_PSE;
+ else
+ pgprot_val(ref_prot) &= ~_PAGE_PSE;
}
#endif
/*
+	 * Set the GLOBAL flag only if the PRESENT flag is set,
+	 * otherwise pmd/pte_present will return true even on a
+	 * non-present pmd/pte.  canon_pgprot() will clear _PAGE_GLOBAL
+	 * for the ancient hardware that doesn't support it.
+ */
+ if (pgprot_val(ref_prot) & _PAGE_PRESENT)
+ pgprot_val(ref_prot) |= _PAGE_GLOBAL;
+ else
+ pgprot_val(ref_prot) &= ~_PAGE_GLOBAL;
+
+ /*
* Get the target pfn from the original entry:
*/
pfn = pte_pfn(*kpte);
for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
- set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
+ set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot)));
- if (address >= (unsigned long)__va(0) &&
- address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
+ if (pfn_range_is_mapped(PFN_DOWN(__pa(address)),
+ PFN_DOWN(__pa(address)) + 1))
split_page_count(level);
-#ifdef CONFIG_X86_64
- if (address >= (unsigned long)__va(1UL<<32) &&
- address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
- split_page_count(level);
-#endif
-
/*
* Install the new, split up pagetable.
*
@@ -579,24 +648,420 @@ static int split_large_page(pte_t *kpte, unsigned long address)
* going on.
*/
__flush_tlb_all();
+ spin_unlock(&pgd_lock);
- base = NULL;
+ return 0;
+}
+
+static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
+ unsigned long address)
+{
+ struct page *base;
+
+ if (!debug_pagealloc)
+ spin_unlock(&cpa_lock);
+ base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
+ if (!debug_pagealloc)
+ spin_lock(&cpa_lock);
+ if (!base)
+ return -ENOMEM;
+
+ if (__split_large_page(cpa, kpte, address, base))
+ __free_page(base);
+
+ return 0;
+}
+
+static bool try_to_free_pte_page(pte_t *pte)
+{
+ int i;
+
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ if (!pte_none(pte[i]))
+ return false;
+
+ free_page((unsigned long)pte);
+ return true;
+}
+
+static bool try_to_free_pmd_page(pmd_t *pmd)
+{
+ int i;
+
+ for (i = 0; i < PTRS_PER_PMD; i++)
+ if (!pmd_none(pmd[i]))
+ return false;
+
+ free_page((unsigned long)pmd);
+ return true;
+}
+
+static bool try_to_free_pud_page(pud_t *pud)
+{
+ int i;
+
+ for (i = 0; i < PTRS_PER_PUD; i++)
+ if (!pud_none(pud[i]))
+ return false;
+
+ free_page((unsigned long)pud);
+ return true;
+}
+
+static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
+{
+ pte_t *pte = pte_offset_kernel(pmd, start);
+
+ while (start < end) {
+ set_pte(pte, __pte(0));
+
+ start += PAGE_SIZE;
+ pte++;
+ }
+
+ if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
+ pmd_clear(pmd);
+ return true;
+ }
+ return false;
+}
+
+static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
+ unsigned long start, unsigned long end)
+{
+ if (unmap_pte_range(pmd, start, end))
+ if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+ pud_clear(pud);
+}
+
+static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
+{
+ pmd_t *pmd = pmd_offset(pud, start);
-out_unlock:
/*
- * If we dropped out via the lookup_address check under
- * pgd_lock then stick the page back into the pool:
+ * Not on a 2MB page boundary?
*/
- if (base)
- __free_page(base);
- spin_unlock(&pgd_lock);
+ if (start & (PMD_SIZE - 1)) {
+ unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
+ unsigned long pre_end = min_t(unsigned long, end, next_page);
+
+ __unmap_pmd_range(pud, pmd, start, pre_end);
+
+ start = pre_end;
+ pmd++;
+ }
+
+ /*
+ * Try to unmap in 2M chunks.
+ */
+ while (end - start >= PMD_SIZE) {
+ if (pmd_large(*pmd))
+ pmd_clear(pmd);
+ else
+ __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
+
+ start += PMD_SIZE;
+ pmd++;
+ }
+
+ /*
+ * 4K leftovers?
+ */
+ if (start < end)
+ return __unmap_pmd_range(pud, pmd, start, end);
+
+ /*
+	 * Try again to free the PMD page if we haven't succeeded above.
+ */
+ if (!pud_none(*pud))
+ if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+ pud_clear(pud);
+}
+
+static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+{
+ pud_t *pud = pud_offset(pgd, start);
+
+ /*
+ * Not on a GB page boundary?
+ */
+ if (start & (PUD_SIZE - 1)) {
+ unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
+ unsigned long pre_end = min_t(unsigned long, end, next_page);
+
+ unmap_pmd_range(pud, start, pre_end);
+ start = pre_end;
+ pud++;
+ }
+
+ /*
+	 * Try to unmap in 1G chunks.
+ */
+ while (end - start >= PUD_SIZE) {
+
+ if (pud_large(*pud))
+ pud_clear(pud);
+ else
+ unmap_pmd_range(pud, start, start + PUD_SIZE);
+
+ start += PUD_SIZE;
+ pud++;
+ }
+
+ /*
+ * 2M leftovers?
+ */
+ if (start < end)
+ unmap_pmd_range(pud, start, end);
+
+ /*
+ * No need to try to free the PUD page because we'll free it in
+	 * populate_pgd's error path.
+ */
+}
+
+static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end)
+{
+ pgd_t *pgd_entry = root + pgd_index(addr);
+
+ unmap_pud_range(pgd_entry, addr, end);
+
+ if (try_to_free_pud_page((pud_t *)pgd_page_vaddr(*pgd_entry)))
+ pgd_clear(pgd_entry);
+}
+
+static int alloc_pte_page(pmd_t *pmd)
+{
+ pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+ if (!pte)
+ return -1;
+
+ set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
+ return 0;
+}
+
+static int alloc_pmd_page(pud_t *pud)
+{
+ pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+ if (!pmd)
+ return -1;
+
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+ return 0;
+}
+
+static void populate_pte(struct cpa_data *cpa,
+ unsigned long start, unsigned long end,
+ unsigned num_pages, pmd_t *pmd, pgprot_t pgprot)
+{
+ pte_t *pte;
+
+ pte = pte_offset_kernel(pmd, start);
+
+ while (num_pages-- && start < end) {
+
+ /* deal with the NX bit */
+ if (!(pgprot_val(pgprot) & _PAGE_NX))
+ cpa->pfn &= ~_PAGE_NX;
+
+ set_pte(pte, pfn_pte(cpa->pfn >> PAGE_SHIFT, pgprot));
+
+ start += PAGE_SIZE;
+ cpa->pfn += PAGE_SIZE;
+ pte++;
+ }
+}
+
+static int populate_pmd(struct cpa_data *cpa,
+ unsigned long start, unsigned long end,
+ unsigned num_pages, pud_t *pud, pgprot_t pgprot)
+{
+ unsigned int cur_pages = 0;
+ pmd_t *pmd;
+
+ /*
+ * Not on a 2M boundary?
+ */
+ if (start & (PMD_SIZE - 1)) {
+ unsigned long pre_end = start + (num_pages << PAGE_SHIFT);
+ unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
+
+ pre_end = min_t(unsigned long, pre_end, next_page);
+ cur_pages = (pre_end - start) >> PAGE_SHIFT;
+ cur_pages = min_t(unsigned int, num_pages, cur_pages);
+
+ /*
+ * Need a PTE page?
+ */
+ pmd = pmd_offset(pud, start);
+ if (pmd_none(*pmd))
+ if (alloc_pte_page(pmd))
+ return -1;
+
+ populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot);
+
+ start = pre_end;
+ }
+
+ /*
+ * We mapped them all?
+ */
+ if (num_pages == cur_pages)
+ return cur_pages;
+
+ while (end - start >= PMD_SIZE) {
+
+ /*
+ * We cannot use a 1G page so allocate a PMD page if needed.
+ */
+ if (pud_none(*pud))
+ if (alloc_pmd_page(pud))
+ return -1;
+
+ pmd = pmd_offset(pud, start);
+
+ set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot)));
+
+ start += PMD_SIZE;
+ cpa->pfn += PMD_SIZE;
+ cur_pages += PMD_SIZE >> PAGE_SHIFT;
+ }
+
+ /*
+ * Map trailing 4K pages.
+ */
+ if (start < end) {
+ pmd = pmd_offset(pud, start);
+ if (pmd_none(*pmd))
+ if (alloc_pte_page(pmd))
+ return -1;
+
+ populate_pte(cpa, start, end, num_pages - cur_pages,
+ pmd, pgprot);
+ }
+ return num_pages;
+}
+
+static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
+ pgprot_t pgprot)
+{
+ pud_t *pud;
+ unsigned long end;
+ int cur_pages = 0;
+
+ end = start + (cpa->numpages << PAGE_SHIFT);
+
+ /*
+	 * Not on a GB page boundary? => map everything up to it with
+ * smaller pages.
+ */
+ if (start & (PUD_SIZE - 1)) {
+ unsigned long pre_end;
+ unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
+
+ pre_end = min_t(unsigned long, end, next_page);
+ cur_pages = (pre_end - start) >> PAGE_SHIFT;
+ cur_pages = min_t(int, (int)cpa->numpages, cur_pages);
+
+ pud = pud_offset(pgd, start);
+
+ /*
+ * Need a PMD page?
+ */
+ if (pud_none(*pud))
+ if (alloc_pmd_page(pud))
+ return -1;
+
+ cur_pages = populate_pmd(cpa, start, pre_end, cur_pages,
+ pud, pgprot);
+ if (cur_pages < 0)
+ return cur_pages;
+
+ start = pre_end;
+ }
+
+ /* We mapped them all? */
+ if (cpa->numpages == cur_pages)
+ return cur_pages;
+
+ pud = pud_offset(pgd, start);
+
+ /*
+	 * Map everything starting from the GB boundary, possibly with 1G pages.
+ */
+ while (end - start >= PUD_SIZE) {
+ set_pud(pud, __pud(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot)));
+
+ start += PUD_SIZE;
+ cpa->pfn += PUD_SIZE;
+ cur_pages += PUD_SIZE >> PAGE_SHIFT;
+ pud++;
+ }
+
+ /* Map trailing leftover */
+ if (start < end) {
+ int tmp;
+
+ pud = pud_offset(pgd, start);
+ if (pud_none(*pud))
+ if (alloc_pmd_page(pud))
+ return -1;
+
+ tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages,
+ pud, pgprot);
+ if (tmp < 0)
+ return cur_pages;
+
+ cur_pages += tmp;
+ }
+ return cur_pages;
+}
+
+/*
+ * Restrictions for the kernel page table do not necessarily apply when
+ * mapping in an alternate PGD.
+ */
+static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
+{
+ pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
+ pud_t *pud = NULL; /* shut up gcc */
+ pgd_t *pgd_entry;
+ int ret;
+
+ pgd_entry = cpa->pgd + pgd_index(addr);
+
+ /*
+ * Allocate a PUD page and hand it down for mapping.
+ */
+ if (pgd_none(*pgd_entry)) {
+ pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+ if (!pud)
+ return -1;
+
+ set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE));
+ }
+
+ pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
+ pgprot_val(pgprot) |= pgprot_val(cpa->mask_set);
+
+ ret = populate_pud(cpa, addr, pgd_entry, pgprot);
+ if (ret < 0) {
+ unmap_pgd_range(cpa->pgd, addr,
+ addr + (cpa->numpages << PAGE_SHIFT));
+ return ret;
+ }
+
+ cpa->numpages = ret;
return 0;
}
static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
int primary)
{
+ if (cpa->pgd)
+ return populate_pgd(cpa, vaddr);
+
/*
* Ignore all non primary paths.
*/
@@ -641,7 +1106,7 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
else
address = *cpa->vaddr;
repeat:
- kpte = lookup_address(address, &level);
+ kpte = _lookup_address_cpa(cpa, address, &level);
if (!kpte)
return __cpa_process_fault(cpa, address, primary);
@@ -660,6 +1125,18 @@ repeat:
new_prot = static_protections(new_prot, address, pfn);
/*
+	 * Set the GLOBAL flag only if the PRESENT flag is
+	 * set, otherwise pte_present will return true even on
+	 * a non-present pte.  canon_pgprot() will clear
+	 * _PAGE_GLOBAL for the ancient hardware that doesn't
+	 * support it.
+ */
+ if (pgprot_val(new_prot) & _PAGE_PRESENT)
+ pgprot_val(new_prot) |= _PAGE_GLOBAL;
+ else
+ pgprot_val(new_prot) &= ~_PAGE_GLOBAL;
+
+ /*
* We need to keep the pfn from the existing PTE,
* after all we're only going to change it's attributes
* not the memory it points to
@@ -693,7 +1170,7 @@ repeat:
/*
* We have to split the large page:
*/
- err = split_large_page(kpte, address);
+ err = split_large_page(cpa, kpte, address);
if (!err) {
/*
* Do a global flush tlb after splitting the large page
@@ -729,13 +1206,9 @@ static int cpa_process_alias(struct cpa_data *cpa)
unsigned long vaddr;
int ret;
- if (cpa->pfn >= max_pfn_mapped)
+ if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1))
return 0;
-#ifdef CONFIG_X86_64
- if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
- return 0;
-#endif
/*
* No need to redo, when the primary call touched the direct
* mapping already:
@@ -846,6 +1319,8 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
int ret, cache, checkalias;
unsigned long baddr = 0;
+ memset(&cpa, 0, sizeof(cpa));
+
/*
* Check, if we are requested to change a not supported
* feature:
@@ -918,10 +1393,10 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
cache = cache_attr(mask_set);
/*
- * On success we use clflush, when the CPU supports it to
- * avoid the wbindv. If the CPU does not support it and in the
+	 * On success we use CLFLUSH, when the CPU supports it, to
+	 * avoid the WBINVD.  If the CPU does not support it and in the
* error case we fall back to cpa_flush_all (which uses
- * wbindv):
+ * WBINVD):
*/
if (!ret && cpu_has_clflush) {
if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
@@ -1292,6 +1767,7 @@ static int __set_pages_p(struct page *page, int numpages)
{
unsigned long tempaddr = (unsigned long) page_address(page);
struct cpa_data cpa = { .vaddr = &tempaddr,
+ .pgd = NULL,
.numpages = numpages,
.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
.mask_clr = __pgprot(0),
@@ -1310,6 +1786,7 @@ static int __set_pages_np(struct page *page, int numpages)
{
unsigned long tempaddr = (unsigned long) page_address(page);
struct cpa_data cpa = { .vaddr = &tempaddr,
+ .pgd = NULL,
.numpages = numpages,
.mask_set = __pgprot(0),
.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
@@ -1348,6 +1825,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
* but that can deadlock->flush only current cpu:
*/
__flush_tlb_all();
+
+ arch_flush_lazy_mmu_mode();
}
#ifdef CONFIG_HIBERNATION
@@ -1368,6 +1847,42 @@ bool kernel_page_present(struct page *page)
#endif /* CONFIG_DEBUG_PAGEALLOC */
+int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
+ unsigned numpages, unsigned long page_flags)
+{
+ int retval = -EINVAL;
+
+ struct cpa_data cpa = {
+ .vaddr = &address,
+ .pfn = pfn,
+ .pgd = pgd,
+ .numpages = numpages,
+ .mask_set = __pgprot(0),
+ .mask_clr = __pgprot(0),
+ .flags = 0,
+ };
+
+ if (!(__supported_pte_mask & _PAGE_NX))
+ goto out;
+
+ if (!(page_flags & _PAGE_NX))
+ cpa.mask_clr = __pgprot(_PAGE_NX);
+
+ cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
+
+ retval = __change_page_attr_set_clr(&cpa, 0);
+ __flush_tlb_all();
+
+out:
+ return retval;
+}
+
+void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address,
+ unsigned numpages)
+{
+ unmap_pgd_range(root, address, address + (numpages << PAGE_SHIFT));
+}
+
/*
* The testcases use internal knowledge of the implementation that shouldn't
* be exposed to the rest of the kernel. Include these directly here.
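
populate_pmd()/populate_pud() and the unmap_* helpers added above all share one shape: cover a possibly unaligned head with smaller pages up to the next 2M or 1G boundary, cover the aligned middle with large entries, then finish any unaligned tail with smaller pages again. A stand-alone sketch of that range-splitting pattern; the 2M constant is spelled out locally and the map actions are printfs rather than page-table writes.

#include <stdio.h>

#define PMD_SIZE (2UL << 20)	/* 2 MiB, for illustration */
#define PMD_MASK (~(PMD_SIZE - 1))

static void map_small(unsigned long s, unsigned long e)
{
	printf("  4K pages: [%#lx, %#lx)\n", s, e);
}

static void map_large(unsigned long s)
{
	printf("  2M page:  [%#lx, %#lx)\n", s, s + PMD_SIZE);
}

static void populate_range(unsigned long start, unsigned long end)
{
	/* Head: not on a 2M boundary? Map up to the next boundary with 4K pages. */
	if (start & (PMD_SIZE - 1)) {
		unsigned long next = (start + PMD_SIZE) & PMD_MASK;
		unsigned long pre_end = next < end ? next : end;

		map_small(start, pre_end);
		start = pre_end;
	}

	/* Middle: whole 2M chunks. */
	while (end - start >= PMD_SIZE) {
		map_large(start);
		start += PMD_SIZE;
	}

	/* Tail: 4K leftovers. */
	if (start < end)
		map_small(start, end);
}

int main(void)
{
	populate_range(0x00123000UL, 0x00801000UL);
	return 0;
}
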
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index f6ff57b7efa..657438858e8 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -158,31 +158,47 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
return req_type;
}
+struct pagerange_state {
+ unsigned long cur_pfn;
+ int ram;
+ int not_ram;
+};
+
+static int
+pagerange_is_ram_callback(unsigned long initial_pfn, unsigned long total_nr_pages, void *arg)
+{
+ struct pagerange_state *state = arg;
+
+ state->not_ram |= initial_pfn > state->cur_pfn;
+ state->ram |= total_nr_pages > 0;
+ state->cur_pfn = initial_pfn + total_nr_pages;
+
+ return state->ram && state->not_ram;
+}
+
static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
{
- int ram_page = 0, not_rampage = 0;
- unsigned long page_nr;
+ int ret = 0;
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ struct pagerange_state state = {start_pfn, 0, 0};
- for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT);
- ++page_nr) {
- /*
- * For legacy reasons, physical address range in the legacy ISA
- * region is tracked as non-RAM. This will allow users of
- * /dev/mem to map portions of legacy ISA region, even when
- * some of those portions are listed(or not even listed) with
- * different e820 types(RAM/reserved/..)
- */
- if (page_nr >= (ISA_END_ADDRESS >> PAGE_SHIFT) &&
- page_is_ram(page_nr))
- ram_page = 1;
- else
- not_rampage = 1;
-
- if (ram_page == not_rampage)
- return -1;
+ /*
+	 * For legacy reasons, the physical address range in the legacy
+	 * ISA region is tracked as non-RAM.  This allows users of
+	 * /dev/mem to map portions of the legacy ISA region, even when
+	 * some of those portions are listed (or not even listed) with
+	 * different e820 types (RAM/reserved/...).
+ */
+ if (start_pfn < ISA_END_ADDRESS >> PAGE_SHIFT)
+ start_pfn = ISA_END_ADDRESS >> PAGE_SHIFT;
+
+ if (start_pfn < end_pfn) {
+ ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
+ &state, pagerange_is_ram_callback);
}
- return ram_page;
+ return (ret > 0) ? -1 : (state.ram ? 1 : 0);
}
/*
@@ -209,9 +225,8 @@ static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
page = pfn_to_page(pfn);
type = get_page_memtype(page);
if (type != -1) {
- printk(KERN_INFO "reserve_ram_pages_type failed "
- "0x%Lx-0x%Lx, track 0x%lx, req 0x%lx\n",
- start, end, type, req_type);
+ printk(KERN_INFO "reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%lx, req 0x%lx\n",
+ start, end - 1, type, req_type);
if (new_type)
*new_type = type;
@@ -314,9 +329,9 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
err = rbt_memtype_check_insert(new, new_type);
if (err) {
- printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
- "track %s, req %s\n",
- start, end, cattr_name(new->type), cattr_name(req_type));
+ printk(KERN_INFO "reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
+ start, end - 1,
+ cattr_name(new->type), cattr_name(req_type));
kfree(new);
spin_unlock(&memtype_lock);
@@ -325,8 +340,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
spin_unlock(&memtype_lock);
- dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
- start, end, cattr_name(new->type), cattr_name(req_type),
+ dprintk("reserve_memtype added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n",
+ start, end - 1, cattr_name(new->type), cattr_name(req_type),
new_type ? cattr_name(*new_type) : "-");
return err;
@@ -360,14 +375,14 @@ int free_memtype(u64 start, u64 end)
spin_unlock(&memtype_lock);
if (!entry) {
- printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n",
- current->comm, current->pid, start, end);
+ printk(KERN_INFO "%s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
+ current->comm, current->pid, start, end - 1);
return -EINVAL;
}
kfree(entry);
- dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
+ dprintk("free_memtype request [mem %#010Lx-%#010Lx]\n", start, end - 1);
return 0;
}
@@ -491,9 +506,8 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
while (cursor < to) {
if (!devmem_is_allowed(pfn)) {
- printk(KERN_INFO
- "Program %s tried to access /dev/mem between %Lx->%Lx.\n",
- current->comm, from, to);
+ printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx]\n",
+ current->comm, from, to - 1);
return 0;
}
cursor += PAGE_SIZE;
@@ -546,20 +560,26 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags)
{
unsigned long id_sz;
- if (base >= __pa(high_memory))
+ if (base > __pa(high_memory-1))
return 0;
- id_sz = (__pa(high_memory) < base + size) ?
+ /*
+	 * Some areas in the middle of the kernel identity range
+ * are not mapped, like the PCI space.
+ */
+ if (!page_is_ram(base >> PAGE_SHIFT))
+ return 0;
+
+ id_sz = (__pa(high_memory-1) <= base + size) ?
__pa(high_memory) - base :
size;
if (ioremap_change_attr((unsigned long)__va(base), id_sz, flags) < 0) {
- printk(KERN_INFO
- "%s:%d ioremap_change_attr failed %s "
- "for %Lx-%Lx\n",
+ printk(KERN_INFO "%s:%d ioremap_change_attr failed %s "
+ "for [mem %#010Lx-%#010Lx]\n",
current->comm, current->pid,
cattr_name(flags),
- base, (unsigned long long)(base + size));
+ base, (unsigned long long)(base + size-1));
return -EINVAL;
}
return 0;
@@ -591,12 +611,11 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
flags = lookup_memtype(paddr);
if (want_flags != flags) {
- printk(KERN_WARNING
- "%s:%d map pfn RAM range req %s for %Lx-%Lx, got %s\n",
+ printk(KERN_WARNING "%s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
current->comm, current->pid,
cattr_name(want_flags),
(unsigned long long)paddr,
- (unsigned long long)(paddr + size),
+ (unsigned long long)(paddr + size - 1),
cattr_name(flags));
*vma_prot = __pgprot((pgprot_val(*vma_prot) &
(~_PAGE_CACHE_MASK)) |
@@ -614,11 +633,11 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
!is_new_memtype_allowed(paddr, size, want_flags, flags)) {
free_memtype(paddr, paddr + size);
printk(KERN_ERR "%s:%d map pfn expected mapping type %s"
- " for %Lx-%Lx, got %s\n",
+ " for [mem %#010Lx-%#010Lx], got %s\n",
current->comm, current->pid,
cattr_name(want_flags),
(unsigned long long)paddr,
- (unsigned long long)(paddr + size),
+ (unsigned long long)(paddr + size - 1),
cattr_name(flags));
return -EINVAL;
}
@@ -652,20 +671,20 @@ static void free_pfn_range(u64 paddr, unsigned long size)
}
/*
- * track_pfn_vma_copy is called when vma that is covering the pfnmap gets
+ * track_pfn_copy is called when vma that is covering the pfnmap gets
* copied through copy_page_range().
*
* If the vma has a linear pfn mapping for the entire range, we get the prot
* from pte and reserve the entire vma range with single reserve_pfn_range call.
*/
-int track_pfn_vma_copy(struct vm_area_struct *vma)
+int track_pfn_copy(struct vm_area_struct *vma)
{
resource_size_t paddr;
unsigned long prot;
unsigned long vma_size = vma->vm_end - vma->vm_start;
pgprot_t pgprot;
- if (is_linear_pfn_mapping(vma)) {
+ if (vma->vm_flags & VM_PAT) {
/*
* reserve the whole chunk covered by vma. We need the
* starting address and protection from pte.
@@ -682,31 +701,59 @@ int track_pfn_vma_copy(struct vm_area_struct *vma)
}
/*
- * track_pfn_vma_new is called when a _new_ pfn mapping is being established
- * for physical range indicated by pfn and size.
- *
* prot is passed in as a parameter for the new mapping. If the vma has a
* linear pfn mapping for the entire range reserve the entire vma range with
* single reserve_pfn_range call.
*/
-int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot,
- unsigned long pfn, unsigned long size)
+int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
+ unsigned long pfn, unsigned long addr, unsigned long size)
{
+ resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
unsigned long flags;
- resource_size_t paddr;
- unsigned long vma_size = vma->vm_end - vma->vm_start;
- if (is_linear_pfn_mapping(vma)) {
- /* reserve the whole chunk starting from vm_pgoff */
- paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
- return reserve_pfn_range(paddr, vma_size, prot, 0);
+ /* reserve the whole chunk starting from paddr */
+ if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) {
+ int ret;
+
+ ret = reserve_pfn_range(paddr, size, prot, 0);
+ if (!ret)
+ vma->vm_flags |= VM_PAT;
+ return ret;
}
if (!pat_enabled)
return 0;
- /* for vm_insert_pfn and friends, we set prot based on lookup */
- flags = lookup_memtype(pfn << PAGE_SHIFT);
+ /*
+ * For anything smaller than the vma size we set prot based on the
+ * lookup.
+ */
+ flags = lookup_memtype(paddr);
+
+ /* Check memtype for the remaining pages */
+ while (size > PAGE_SIZE) {
+ size -= PAGE_SIZE;
+ paddr += PAGE_SIZE;
+ if (flags != lookup_memtype(paddr))
+ return -EINVAL;
+ }
+
+ *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
+ flags);
+
+ return 0;
+}
+
+int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
+ unsigned long pfn)
+{
+ unsigned long flags;
+
+ if (!pat_enabled)
+ return 0;
+
+ /* Set prot based on lookup */
+ flags = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT);
*prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
flags);
@@ -714,22 +761,31 @@ int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot,
}
/*
- * untrack_pfn_vma is called while unmapping a pfnmap for a region.
+ * untrack_pfn is called while unmapping a pfnmap for a region.
* untrack can be called for a specific region indicated by pfn and size or
- * can be for the entire vma (in which case size can be zero).
+ * can be for the entire vma (in which case pfn and size are zero).
*/
-void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
- unsigned long size)
+void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
+ unsigned long size)
{
resource_size_t paddr;
- unsigned long vma_size = vma->vm_end - vma->vm_start;
+ unsigned long prot;
- if (is_linear_pfn_mapping(vma)) {
- /* free the whole chunk starting from vm_pgoff */
- paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
- free_pfn_range(paddr, vma_size);
+ if (!(vma->vm_flags & VM_PAT))
return;
+
+ /* free the chunk starting from pfn or the whole chunk */
+ paddr = (resource_size_t)pfn << PAGE_SHIFT;
+ if (!paddr && !size) {
+ if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ size = vma->vm_end - vma->vm_start;
}
+ free_pfn_range(paddr, size);
+ vma->vm_flags &= ~VM_PAT;
}
pgprot_t pgprot_writecombine(pgprot_t prot)
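
A minimal sketch of how the renamed hooks get exercised: a hypothetical driver mmap handler (device base address and names are made up) calls remap_pfn_range(), which calls track_pfn_remap(); because addr/size cover the whole vma, the range is reserved through reserve_pfn_range() and VM_PAT is set. At unmap time untrack_pfn() checks VM_PAT and releases the reservation via free_pfn_range().

	#include <linux/fs.h>
	#include <linux/mm.h>

	#define DEMO_MMIO_BASE	0xfed00000UL	/* hypothetical device address */

	static int demo_mmap(struct file *file, struct vm_area_struct *vma)
	{
		unsigned long size = vma->vm_end - vma->vm_start;

		/*
		 * remap_pfn_range() -> track_pfn_remap(): the mapping spans
		 * the whole vma, so the memtype is reserved once and VM_PAT
		 * is set on the vma.
		 */
		return remap_pfn_range(vma, vma->vm_start,
				       DEMO_MMIO_BASE >> PAGE_SHIFT,
				       size, vma->vm_page_prot);
	}
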
diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
index 8acaddd0fb2..415f6c4ced3 100644
--- a/arch/x86/mm/pat_rbtree.c
+++ b/arch/x86/mm/pat_rbtree.c
@@ -12,7 +12,7 @@
#include <linux/debugfs.h>
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/rbtree.h>
+#include <linux/rbtree_augmented.h>
#include <linux/sched.h>
#include <linux/gfp.h>
@@ -54,29 +54,24 @@ static u64 get_subtree_max_end(struct rb_node *node)
return ret;
}
-/* Update 'subtree_max_end' for a node, based on node and its children */
-static void memtype_rb_augment_cb(struct rb_node *node, void *__unused)
+static u64 compute_subtree_max_end(struct memtype *data)
{
- struct memtype *data;
- u64 max_end, child_max_end;
-
- if (!node)
- return;
+ u64 max_end = data->end, child_max_end;
- data = container_of(node, struct memtype, rb);
- max_end = data->end;
-
- child_max_end = get_subtree_max_end(node->rb_right);
+ child_max_end = get_subtree_max_end(data->rb.rb_right);
if (child_max_end > max_end)
max_end = child_max_end;
- child_max_end = get_subtree_max_end(node->rb_left);
+ child_max_end = get_subtree_max_end(data->rb.rb_left);
if (child_max_end > max_end)
max_end = child_max_end;
- data->subtree_max_end = max_end;
+ return max_end;
}
+RB_DECLARE_CALLBACKS(static, memtype_rb_augment_cb, struct memtype, rb,
+ u64, subtree_max_end, compute_subtree_max_end)
+
/* Find the first (lowest start addr) overlapping range from rb tree */
static struct memtype *memtype_rb_lowest_match(struct rb_root *root,
u64 start, u64 end)
@@ -179,15 +174,17 @@ static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata)
struct memtype *data = container_of(*node, struct memtype, rb);
parent = *node;
+ if (data->subtree_max_end < newdata->end)
+ data->subtree_max_end = newdata->end;
if (newdata->start <= data->start)
node = &((*node)->rb_left);
else if (newdata->start > data->start)
node = &((*node)->rb_right);
}
+ newdata->subtree_max_end = newdata->end;
rb_link_node(&newdata->rb, parent, node);
- rb_insert_color(&newdata->rb, root);
- rb_augment_insert(&newdata->rb, memtype_rb_augment_cb, NULL);
+ rb_insert_augmented(&newdata->rb, root, &memtype_rb_augment_cb);
}
int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type)
@@ -209,16 +206,13 @@ int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type)
struct memtype *rbt_memtype_erase(u64 start, u64 end)
{
- struct rb_node *deepest;
struct memtype *data;
data = memtype_rb_exact_match(&memtype_rbroot, start, end);
if (!data)
goto out;
- deepest = rb_augment_erase_begin(&data->rb);
- rb_erase(&data->rb, &memtype_rbroot);
- rb_augment_erase_end(deepest, memtype_rb_augment_cb, NULL);
+ rb_erase_augmented(&data->rb, &memtype_rbroot, &memtype_rb_augment_cb);
out:
return data;
}
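
RB_DECLARE_CALLBACKS() derives the propagate/copy/rotate callbacks from one compute function. A stand-alone sketch of the same augmented-rbtree pattern for a hypothetical interval node (names are illustrative, not taken from pat_rbtree.c):

	#include <linux/rbtree_augmented.h>

	struct ivl {
		struct rb_node	rb;
		u64		start, end;
		u64		subtree_max_end;	/* augmented value */
	};

	static u64 ivl_compute_max_end(struct ivl *node)
	{
		u64 max = node->end, child;

		if (node->rb.rb_right) {
			child = rb_entry(node->rb.rb_right, struct ivl,
					 rb)->subtree_max_end;
			if (child > max)
				max = child;
		}
		if (node->rb.rb_left) {
			child = rb_entry(node->rb.rb_left, struct ivl,
					 rb)->subtree_max_end;
			if (child > max)
				max = child;
		}
		return max;
	}

	RB_DECLARE_CALLBACKS(static, ivl_augment_cb, struct ivl, rb,
			     u64, subtree_max_end, ivl_compute_max_end)

Insertion then mirrors memtype_rb_insert(): walk down keeping subtree_max_end up to date, rb_link_node(), and finish with rb_insert_augmented(&node->rb, &root, &ivl_augment_cb); removal uses rb_erase_augmented().
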
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 8573b83a63d..6fb6927f9e7 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -25,8 +25,12 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
struct page *pte;
pte = alloc_pages(__userpte_alloc_gfp, 0);
- if (pte)
- pgtable_page_ctor(pte);
+ if (!pte)
+ return NULL;
+ if (!pgtable_page_ctor(pte)) {
+ __free_page(pte);
+ return NULL;
+ }
return pte;
}
@@ -57,8 +61,17 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
#if PAGETABLE_LEVELS > 2
void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
+ struct page *page = virt_to_page(pmd);
paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
- tlb_remove_page(tlb, virt_to_page(pmd));
+ /*
+ * NOTE! For PAE, any changes to the top page-directory-pointer-table
+ * entries need a full cr3 reload to flush.
+ */
+#ifdef CONFIG_X86_PAE
+ tlb->need_flush_all = 1;
+#endif
+ pgtable_pmd_page_dtor(page);
+ tlb_remove_page(tlb, page);
}
#if PAGETABLE_LEVELS > 3
@@ -137,7 +150,7 @@ static void pgd_dtor(pgd_t *pgd)
* against pageattr.c; it is the unique case in which a valid change
* of kernel pagetables can't be lazily synchronized by vmalloc faults.
* vmalloc faults work because attached pagetables are never freed.
- * -- wli
+ * -- nyc
*/
#ifdef CONFIG_X86_PAE
@@ -182,8 +195,10 @@ static void free_pmds(pmd_t *pmds[])
int i;
for(i = 0; i < PREALLOCATED_PMDS; i++)
- if (pmds[i])
+ if (pmds[i]) {
+ pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
free_page((unsigned long)pmds[i]);
+ }
}
static int preallocate_pmds(pmd_t *pmds[])
@@ -193,8 +208,13 @@ static int preallocate_pmds(pmd_t *pmds[])
for(i = 0; i < PREALLOCATED_PMDS; i++) {
pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP);
- if (pmd == NULL)
+ if (!pmd)
+ failed = true;
+ if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) {
+ free_page((unsigned long)pmd);
+ pmd = NULL;
failed = true;
+ }
pmds[i] = pmd;
}
@@ -233,7 +253,6 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
{
pud_t *pud;
- unsigned long addr;
int i;
if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
@@ -241,8 +260,7 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
pud = pud_offset(pgd, 0);
- for (addr = i = 0; i < PREALLOCATED_PMDS;
- i++, pud++, addr += PUD_SIZE) {
+ for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
pmd_t *pmd = pmds[i];
if (i >= KERNEL_PGD_BOUNDARY)
@@ -301,6 +319,13 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
free_page((unsigned long)pgd);
}
+/*
+ * Used to set accessed or dirty bits in the page table entries
+ * on other architectures. On x86, the accessed and dirty bits
+ * are tracked by hardware. However, do_wp_page calls this function
+ * to also make the pte writeable at the same time the dirty bit is
+ * set. In that case we do actually need to write the PTE.
+ */
int ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
pte_t entry, int dirty)
@@ -310,7 +335,6 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
if (changed && dirty) {
*ptep = entry;
pte_update_defer(vma->vm_mm, address, ptep);
- flush_tlb_page(vma, address);
}
return changed;
@@ -328,7 +352,12 @@ int pmdp_set_access_flags(struct vm_area_struct *vma,
if (changed && dirty) {
*pmdp = entry;
pmd_update_defer(vma->vm_mm, address, pmdp);
- flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ /*
+ * We had a write-protection fault here and changed the pmd
+ * to be more permissive. No need to flush the TLB for that,
+ * #PF is architecturally guaranteed to do that and in the
+ * worst-case we'll generate a spurious fault.
+ */
}
return changed;
@@ -370,13 +399,20 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma,
int ptep_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep)
{
- int young;
-
- young = ptep_test_and_clear_young(vma, address, ptep);
- if (young)
- flush_tlb_page(vma, address);
-
- return young;
+ /*
+ * On x86 CPUs, clearing the accessed bit without a TLB flush
+ * doesn't cause data corruption. [ It could cause incorrect
+ * page aging and the (mistaken) reclaim of hot pages, but the
+ * chance of that should be relatively low. ]
+ *
+ * So as a performance optimization don't flush the TLB when
+ * clearing the accessed bit, it will eventually be flushed by
+ * a context switch or a VM operation anyway. [ In the rare
+ * event of it not getting flushed for a long time the delay
+ * shouldn't really matter because there's no real memory
+ * pressure for swapout to react to. ]
+ */
+ return ptep_test_and_clear_young(vma, address, ptep);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -420,9 +456,9 @@ void __init reserve_top_address(unsigned long reserve)
{
#ifdef CONFIG_X86_32
BUG_ON(fixmaps_set > 0);
- printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
- (int)-reserve);
- __FIXADDR_TOP = -reserve - PAGE_SIZE;
+ __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE;
+ printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n",
+ -reserve, __FIXADDR_TOP + PAGE_SIZE);
#endif
}
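
Since pgtable_page_ctor()/pgtable_pmd_page_ctor() can fail, every allocation site now needs the alloc/ctor/undo sequence used above. A minimal sketch of that pattern with a made-up helper name (the constructor can fail e.g. when a split page-table lock has to be allocated):

	#include <linux/gfp.h>
	#include <linux/mm.h>

	static struct page *demo_alloc_pte_page(gfp_t gfp)
	{
		struct page *page = alloc_pages(gfp, 0);

		if (!page)
			return NULL;
		if (!pgtable_page_ctor(page)) {	/* may fail */
			__free_page(page);
			return NULL;
		}
		return page;
	}
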
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index cac71849925..4dd8cf65257 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -10,7 +10,6 @@
#include <linux/spinlock.h>
#include <linux/module.h>
-#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
@@ -128,7 +127,7 @@ static int __init parse_reservetop(char *arg)
address = memparse(arg, &arg);
reserve_top_address(address);
- fixup_early_ioremap();
+ early_ioremap_init();
return 0;
}
early_param("reservetop", parse_reservetop);
diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c
index d2e2735327b..e666cbbb926 100644
--- a/arch/x86/mm/physaddr.c
+++ b/arch/x86/mm/physaddr.c
@@ -1,3 +1,4 @@
+#include <linux/bootmem.h>
#include <linux/mmdebug.h>
#include <linux/module.h>
#include <linux/mm.h>
@@ -8,33 +9,54 @@
#ifdef CONFIG_X86_64
+#ifdef CONFIG_DEBUG_VIRTUAL
unsigned long __phys_addr(unsigned long x)
{
- if (x >= __START_KERNEL_map) {
- x -= __START_KERNEL_map;
- VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE);
- x += phys_base;
+ unsigned long y = x - __START_KERNEL_map;
+
+ /* use the carry flag to determine if x was < __START_KERNEL_map */
+ if (unlikely(x > y)) {
+ x = y + phys_base;
+
+ VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
} else {
- VIRTUAL_BUG_ON(x < PAGE_OFFSET);
- x -= PAGE_OFFSET;
- VIRTUAL_BUG_ON(!phys_addr_valid(x));
+ x = y + (__START_KERNEL_map - PAGE_OFFSET);
+
+ /* carry flag will be set if starting x was >= PAGE_OFFSET */
+ VIRTUAL_BUG_ON((x > y) || !phys_addr_valid(x));
}
+
return x;
}
EXPORT_SYMBOL(__phys_addr);
+unsigned long __phys_addr_symbol(unsigned long x)
+{
+ unsigned long y = x - __START_KERNEL_map;
+
+ /* only check upper bounds since lower bounds will trigger carry */
+ VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
+
+ return y + phys_base;
+}
+EXPORT_SYMBOL(__phys_addr_symbol);
+#endif
+
bool __virt_addr_valid(unsigned long x)
{
- if (x >= __START_KERNEL_map) {
- x -= __START_KERNEL_map;
- if (x >= KERNEL_IMAGE_SIZE)
+ unsigned long y = x - __START_KERNEL_map;
+
+ /* use the carry flag to determine if x was < __START_KERNEL_map */
+ if (unlikely(x > y)) {
+ x = y + phys_base;
+
+ if (y >= KERNEL_IMAGE_SIZE)
return false;
- x += phys_base;
} else {
- if (x < PAGE_OFFSET)
- return false;
- x -= PAGE_OFFSET;
- if (!phys_addr_valid(x))
+ x = y + (__START_KERNEL_map - PAGE_OFFSET);
+
+ /* carry flag will be set if starting x was >= PAGE_OFFSET */
+ if ((x > y) || !phys_addr_valid(x))
return false;
}
@@ -47,10 +69,16 @@ EXPORT_SYMBOL(__virt_addr_valid);
#ifdef CONFIG_DEBUG_VIRTUAL
unsigned long __phys_addr(unsigned long x)
{
+ unsigned long phys_addr = x - PAGE_OFFSET;
/* VMALLOC_* aren't constants */
VIRTUAL_BUG_ON(x < PAGE_OFFSET);
VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
- return x - PAGE_OFFSET;
+ /* max_low_pfn is set early, but not _that_ early */
+ if (max_low_pfn) {
+ VIRTUAL_BUG_ON((phys_addr >> PAGE_SHIFT) > max_low_pfn);
+ BUG_ON(slow_virt_to_phys((void *)x) != phys_addr);
+ }
+ return phys_addr;
}
EXPORT_SYMBOL(__phys_addr);
#endif
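
The rewritten __phys_addr() leans on unsigned wraparound: y = x - __START_KERNEL_map underflows for addresses below the kernel text mapping, so a single x > y test tells the two cases apart (the "carry flag" mentioned in the comments). A stand-alone sketch of the idiom, with an illustrative constant rather than the real kernel layout:

	#include <stdbool.h>

	#define DEMO_START_KERNEL_MAP	0xffffffff80000000UL	/* illustrative */

	static bool demo_is_kernel_text_addr(unsigned long x)
	{
		unsigned long y = x - DEMO_START_KERNEL_MAP;

		/*
		 * If x < DEMO_START_KERNEL_MAP the subtraction wraps, making
		 * y larger than x; if x >= DEMO_START_KERNEL_MAP, y is the
		 * (smaller) offset into the kernel mapping.  So x > y holds
		 * exactly when x is at or above the kernel text mapping.
		 */
		return x > y;
	}
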
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
index 410531d3c29..90555bf60aa 100644
--- a/arch/x86/mm/setup_nx.c
+++ b/arch/x86/mm/setup_nx.c
@@ -5,7 +5,7 @@
#include <asm/pgtable.h>
#include <asm/proto.h>
-static int disable_nx __cpuinitdata;
+static int disable_nx;
/*
* noexec = on|off
@@ -29,7 +29,7 @@ static int __init noexec_setup(char *str)
}
early_param("noexec", noexec_setup);
-void __cpuinit x86_configure_nx(void)
+void x86_configure_nx(void)
{
if (cpu_has_nx && !disable_nx)
__supported_pte_mask |= _PAGE_NX;
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 1c1c4f46a7c..66338a60aa6 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -42,15 +42,31 @@ static __init inline int srat_disabled(void)
return acpi_numa < 0;
}
-/* Callback for SLIT parsing */
+/*
+ * Callback for SLIT parsing. pxm_to_node() returns NUMA_NO_NODE for
+ * I/O localities since SRAT does not list them. I/O localities are
+ * not supported at this point.
+ */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
int i, j;
- for (i = 0; i < slit->locality_count; i++)
- for (j = 0; j < slit->locality_count; j++)
- numa_set_distance(pxm_to_node(i), pxm_to_node(j),
+ for (i = 0; i < slit->locality_count; i++) {
+ const int from_node = pxm_to_node(i);
+
+ if (from_node == NUMA_NO_NODE)
+ continue;
+
+ for (j = 0; j < slit->locality_count; j++) {
+ const int to_node = pxm_to_node(j);
+
+ if (to_node == NUMA_NO_NODE)
+ continue;
+
+ numa_set_distance(from_node, to_node,
slit->entry[slit->locality_count * i + j]);
+ }
+ }
}
/* Callback for Proximity Domain -> x2APIC mapping */
@@ -70,7 +86,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
return;
pxm = pa->proximity_domain;
apic_id = pa->apic_id;
- if (!cpu_has_x2apic && (apic_id >= 0xff)) {
+ if (!apic->apic_id_valid(apic_id)) {
printk(KERN_INFO "SRAT: PXM %u -> X2APIC 0x%04x ignored\n",
pxm, apic_id);
return;
@@ -142,42 +158,55 @@ static inline int save_add_info(void) {return 0;}
#endif
/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
-void __init
+int __init
acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
{
u64 start, end;
+ u32 hotpluggable;
int node, pxm;
if (srat_disabled())
- return;
- if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
- bad_srat();
- return;
- }
+ goto out_err;
+ if (ma->header.length != sizeof(struct acpi_srat_mem_affinity))
+ goto out_err_bad_srat;
if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
- return;
+ goto out_err;
+ hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE;
+ if (hotpluggable && !save_add_info())
+ goto out_err;
- if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
- return;
start = ma->base_address;
end = start + ma->length;
pxm = ma->proximity_domain;
if (acpi_srat_revision <= 1)
pxm &= 0xff;
+
node = setup_node(pxm);
if (node < 0) {
printk(KERN_ERR "SRAT: Too many proximity domains.\n");
- bad_srat();
- return;
+ goto out_err_bad_srat;
}
- if (numa_add_memblk(node, start, end) < 0) {
- bad_srat();
- return;
- }
+ if (numa_add_memblk(node, start, end) < 0)
+ goto out_err_bad_srat;
+
+ node_set(node, numa_nodes_parsed);
- printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
- start, end);
+ pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s\n",
+ node, pxm,
+ (unsigned long long) start, (unsigned long long) end - 1,
+ hotpluggable ? " hotplug" : "");
+
+ /* Mark hotplug range in memblock. */
+ if (hotpluggable && memblock_mark_hotplug(start, ma->length))
+ pr_warn("SRAT: Failed to mark hotplug range [mem %#010Lx-%#010Lx] in memblock\n",
+ (unsigned long long)start, (unsigned long long)end - 1);
+
+ return 0;
+out_err_bad_srat:
+ bad_srat();
+out_err:
+ return -1;
}
void __init acpi_numa_arch_fixup(void) {}
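
The SLIT distance matrix is stored row-major, so the distance from locality i to locality j lives at entry[locality_count * i + j], which is what the rewritten loop indexes after translating both localities with pxm_to_node(). A tiny sketch of that indexing (the helper is hypothetical; struct acpi_table_slit comes from ACPICA):

	#include <linux/acpi.h>

	/* Distance from locality i to locality j in a row-major SLIT. */
	static u8 demo_slit_distance(struct acpi_table_slit *slit, int i, int j)
	{
		return slit->entry[slit->locality_count * i + j];
	}
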
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index d6c0418c3e4..dd8dda167a2 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -12,6 +12,7 @@
#include <asm/cache.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>
+#include <linux/debugfs.h>
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
= { &init_mm, 0, };
@@ -27,33 +28,14 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
*
* More scalable flush, from Andi Kleen
*
- * To avoid global state use 8 different call vectors.
- * Each CPU uses a specific vector to trigger flushes on other
- * CPUs. Depending on the received vector the target CPUs look into
- * the right array slot for the flush data.
- *
- * With more than 8 CPUs they are hashed to the 8 available
- * vectors. The limited global vector space forces us to this right now.
- * In future when interrupts are split into per CPU domains this could be
- * fixed, at the cost of triggering multiple IPIs in some cases.
+ * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
*/
-union smp_flush_state {
- struct {
- struct mm_struct *flush_mm;
- unsigned long flush_va;
- raw_spinlock_t tlbstate_lock;
- DECLARE_BITMAP(flush_cpumask, NR_CPUS);
- };
- char pad[INTERNODE_CACHE_BYTES];
-} ____cacheline_internodealigned_in_smp;
-
-/* State is put into the per CPU data section, but padded
- to a full cache line because other CPUs can access it and we don't
- want false sharing in the per cpu data segment. */
-static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
-
-static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset);
+struct flush_tlb_info {
+ struct mm_struct *flush_mm;
+ unsigned long flush_start;
+ unsigned long flush_end;
+};
/*
* We cannot call mmdrop() because we are in interrupt context,
@@ -61,37 +43,36 @@ static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset);
*/
void leave_mm(int cpu)
{
- if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
+ struct mm_struct *active_mm = this_cpu_read(cpu_tlbstate.active_mm);
+ if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
BUG();
- cpumask_clear_cpu(cpu,
- mm_cpumask(percpu_read(cpu_tlbstate.active_mm)));
- load_cr3(swapper_pg_dir);
+ if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
+ cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
+ load_cr3(swapper_pg_dir);
+ }
}
EXPORT_SYMBOL_GPL(leave_mm);
/*
- *
* The flush IPI assumes that a thread switch happens in this order:
* [cpu0: the cpu that switches]
* 1) switch_mm() either 1a) or 1b)
* 1a) thread switch to a different mm
- * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
- * Stop ipi delivery for the old mm. This is not synchronized with
- * the other cpus, but smp_invalidate_interrupt ignore flush ipis
- * for the wrong mm, and in the worst case we perform a superfluous
- * tlb flush.
- * 1a2) set cpu mmu_state to TLBSTATE_OK
- * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
- * was in lazy tlb mode.
- * 1a3) update cpu active_mm
+ * 1a1) set cpu_tlbstate to TLBSTATE_OK
+ * Now the tlb flush IPI handler flush_tlb_func won't call leave_mm
+ * if cpu0 was in lazy tlb mode.
+ * 1a2) update cpu active_mm
* Now cpu0 accepts tlb flushes for the new mm.
- * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
+ * 1a3) cpu_set(cpu, new_mm->cpu_vm_mask);
* Now the other cpus will send tlb flush ipis.
* 1a4) change cr3.
+ * 1a5) cpu_clear(cpu, old_mm->cpu_vm_mask);
+ * Stop ipi delivery for the old mm. This is not synchronized with
+ * the other cpus, but flush_tlb_func ignores flush ipis for the wrong
+ * mm, and in the worst case we perform a superfluous tlb flush.
* 1b) thread switch without mm change
- * cpu active_mm is correct, cpu0 already handles
- * flush ipis.
- * 1b1) set cpu mmu_state to TLBSTATE_OK
+ * cpu active_mm is correct, cpu0 already handles flush ipis.
+ * 1b1) set cpu_tlbstate to TLBSTATE_OK
* 1b2) test_and_set the cpu bit in cpu_vm_mask.
* Atomically set the bit [other cpus will start sending flush ipis],
* and test the bit.
@@ -104,203 +85,137 @@ EXPORT_SYMBOL_GPL(leave_mm);
* runs in kernel space, the cpu could load tlb entries for user space
* pages.
*
- * The good news is that cpu mmu_state is local to each cpu, no
+ * The good news is that cpu_tlbstate is local to each cpu, no
* write/read ordering problems.
*/
/*
- * TLB flush IPI:
- *
+ * TLB flush function:
* 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
* 2) Leave the mm if we are in the lazy tlb mode.
- *
- * Interrupts are disabled.
- */
-
-/*
- * FIXME: use of asmlinkage is not consistent. On x86_64 it's noop
- * but still used for documentation purpose but the usage is slightly
- * inconsistent. On x86_32, asmlinkage is regparm(0) but interrupt
- * entry calls in with the first parameter in %eax. Maybe define
- * intrlinkage?
*/
-#ifdef CONFIG_X86_64
-asmlinkage
-#endif
-void smp_invalidate_interrupt(struct pt_regs *regs)
+static void flush_tlb_func(void *info)
{
- unsigned int cpu;
- unsigned int sender;
- union smp_flush_state *f;
-
- cpu = smp_processor_id();
- /*
- * orig_rax contains the negated interrupt vector.
- * Use that to determine where the sender put the data.
- */
- sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
- f = &flush_state[sender];
-
- if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask)))
- goto out;
- /*
- * This was a BUG() but until someone can quote me the
- * line from the intel manual that guarantees an IPI to
- * multiple CPUs is retried _only_ on the erroring CPUs
- * its staying as a return
- *
- * BUG();
- */
-
- if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
- if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
- if (f->flush_va == TLB_FLUSH_ALL)
- local_flush_tlb();
- else
- __flush_tlb_one(f->flush_va);
- } else
- leave_mm(cpu);
- }
-out:
- ack_APIC_irq();
- smp_mb__before_clear_bit();
- cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask));
- smp_mb__after_clear_bit();
+ struct flush_tlb_info *f = info;
+
inc_irq_stat(irq_tlb_count);
-}
-static void flush_tlb_others_ipi(const struct cpumask *cpumask,
- struct mm_struct *mm, unsigned long va)
-{
- unsigned int sender;
- union smp_flush_state *f;
-
- /* Caller has disabled preemption */
- sender = this_cpu_read(tlb_vector_offset);
- f = &flush_state[sender];
-
- if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
- raw_spin_lock(&f->tlbstate_lock);
-
- f->flush_mm = mm;
- f->flush_va = va;
- if (cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id()))) {
- /*
- * We have to send the IPI only to
- * CPUs affected.
- */
- apic->send_IPI_mask(to_cpumask(f->flush_cpumask),
- INVALIDATE_TLB_VECTOR_START + sender);
-
- while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
- cpu_relax();
- }
+ if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
+ return;
+
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+ if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
+ if (f->flush_end == TLB_FLUSH_ALL)
+ local_flush_tlb();
+ else if (!f->flush_end)
+ __flush_tlb_single(f->flush_start);
+ else {
+ unsigned long addr;
+ addr = f->flush_start;
+ while (addr < f->flush_end) {
+ __flush_tlb_single(addr);
+ addr += PAGE_SIZE;
+ }
+ }
+ } else
+ leave_mm(smp_processor_id());
- f->flush_mm = NULL;
- f->flush_va = 0;
- if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
- raw_spin_unlock(&f->tlbstate_lock);
}
void native_flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm, unsigned long va)
+ struct mm_struct *mm, unsigned long start,
+ unsigned long end)
{
+ struct flush_tlb_info info;
+ info.flush_mm = mm;
+ info.flush_start = start;
+ info.flush_end = end;
+
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
if (is_uv_system()) {
unsigned int cpu;
cpu = smp_processor_id();
- cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu);
+ cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu);
if (cpumask)
- flush_tlb_others_ipi(cpumask, mm, va);
+ smp_call_function_many(cpumask, flush_tlb_func,
+ &info, 1);
return;
}
- flush_tlb_others_ipi(cpumask, mm, va);
+ smp_call_function_many(cpumask, flush_tlb_func, &info, 1);
}
-static void __cpuinit calculate_tlb_offset(void)
-{
- int cpu, node, nr_node_vecs, idx = 0;
- /*
- * we are changing tlb_vector_offset for each CPU in runtime, but this
- * will not cause inconsistency, as the write is atomic under X86. we
- * might see more lock contentions in a short time, but after all CPU's
- * tlb_vector_offset are changed, everything should go normal
- *
- * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might
- * waste some vectors.
- **/
- if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS)
- nr_node_vecs = 1;
- else
- nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes;
-
- for_each_online_node(node) {
- int node_offset = (idx % NUM_INVALIDATE_TLB_VECTORS) *
- nr_node_vecs;
- int cpu_offset = 0;
- for_each_cpu(cpu, cpumask_of_node(node)) {
- per_cpu(tlb_vector_offset, cpu) = node_offset +
- cpu_offset;
- cpu_offset++;
- cpu_offset = cpu_offset % nr_node_vecs;
- }
- idx++;
- }
-}
-
-static int __cpuinit tlb_cpuhp_notify(struct notifier_block *n,
- unsigned long action, void *hcpu)
-{
- switch (action & 0xf) {
- case CPU_ONLINE:
- case CPU_DEAD:
- calculate_tlb_offset();
- }
- return NOTIFY_OK;
-}
-
-static int __cpuinit init_smp_flush(void)
-{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(flush_state); i++)
- raw_spin_lock_init(&flush_state[i].tlbstate_lock);
-
- calculate_tlb_offset();
- hotcpu_notifier(tlb_cpuhp_notify, 0);
- return 0;
-}
-core_initcall(init_smp_flush);
-
void flush_tlb_current_task(void)
{
struct mm_struct *mm = current->mm;
preempt_disable();
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
local_flush_tlb();
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
- flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL);
+ flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
preempt_enable();
}
-void flush_tlb_mm(struct mm_struct *mm)
+void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, unsigned long vmflag)
{
+ unsigned long addr;
+ unsigned act_entries, tlb_entries = 0;
+ unsigned long nr_base_pages;
+
preempt_disable();
+ if (current->active_mm != mm)
+ goto flush_all;
- if (current->active_mm == mm) {
- if (current->mm)
- local_flush_tlb();
- else
- leave_mm(smp_processor_id());
+ if (!current->mm) {
+ leave_mm(smp_processor_id());
+ goto flush_all;
}
- if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
- flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL);
+ if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1
+ || vmflag & VM_HUGETLB) {
+ local_flush_tlb();
+ goto flush_all;
+ }
+
+ /* In modern CPUs, the last level TLB is used for both data and instructions */
+ if (vmflag & VM_EXEC)
+ tlb_entries = tlb_lli_4k[ENTRIES];
+ else
+ tlb_entries = tlb_lld_4k[ENTRIES];
+
+ /* Assume all of the TLB entries are occupied by this task */
+ act_entries = tlb_entries >> tlb_flushall_shift;
+ act_entries = mm->total_vm > act_entries ? act_entries : mm->total_vm;
+ nr_base_pages = (end - start) >> PAGE_SHIFT;
+
+ /* tlb_flushall_shift is the balance point; details are in the commit log */
+ if (nr_base_pages > act_entries) {
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ local_flush_tlb();
+ } else {
+ /* flush the range one page at a time with 'invlpg' */
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
+ __flush_tlb_single(addr);
+ }
+
+ if (cpumask_any_but(mm_cpumask(mm),
+ smp_processor_id()) < nr_cpu_ids)
+ flush_tlb_others(mm_cpumask(mm), mm, start, end);
+ preempt_enable();
+ return;
+ }
+
+flush_all:
+ if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
+ flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
preempt_enable();
}
-void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
{
struct mm_struct *mm = vma->vm_mm;
@@ -308,25 +223,105 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
if (current->active_mm == mm) {
if (current->mm)
- __flush_tlb_one(va);
+ __flush_tlb_one(start);
else
leave_mm(smp_processor_id());
}
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
- flush_tlb_others(mm_cpumask(mm), mm, va);
+ flush_tlb_others(mm_cpumask(mm), mm, start, 0UL);
preempt_enable();
}
static void do_flush_tlb_all(void *info)
{
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
__flush_tlb_all();
- if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
+ if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
leave_mm(smp_processor_id());
}
void flush_tlb_all(void)
{
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
on_each_cpu(do_flush_tlb_all, NULL, 1);
}
+
+static void do_kernel_range_flush(void *info)
+{
+ struct flush_tlb_info *f = info;
+ unsigned long addr;
+
+ /* flush the range one page at a time with 'invlpg' */
+ for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE)
+ __flush_tlb_single(addr);
+}
+
+void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+ unsigned act_entries;
+ struct flush_tlb_info info;
+
+ /* In modern CPUs, the last level TLB is used for both data and instructions */
+ act_entries = tlb_lld_4k[ENTRIES];
+
+ /* Balanced as with a user-space task's flush, a bit conservative */
+ if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 ||
+ (end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift)
+
+ on_each_cpu(do_flush_tlb_all, NULL, 1);
+ else {
+ info.flush_start = start;
+ info.flush_end = end;
+ on_each_cpu(do_kernel_range_flush, &info, 1);
+ }
+}
+
+#ifdef CONFIG_DEBUG_TLBFLUSH
+static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ char buf[32];
+ unsigned int len;
+
+ len = sprintf(buf, "%hd\n", tlb_flushall_shift);
+ return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+static ssize_t tlbflush_write_file(struct file *file,
+ const char __user *user_buf, size_t count, loff_t *ppos)
+{
+ char buf[32];
+ ssize_t len;
+ s8 shift;
+
+ len = min(count, sizeof(buf) - 1);
+ if (copy_from_user(buf, user_buf, len))
+ return -EFAULT;
+
+ buf[len] = '\0';
+ if (kstrtos8(buf, 0, &shift))
+ return -EINVAL;
+
+ if (shift < -1 || shift >= BITS_PER_LONG)
+ return -EINVAL;
+
+ tlb_flushall_shift = shift;
+ return count;
+}
+
+static const struct file_operations fops_tlbflush = {
+ .read = tlbflush_read_file,
+ .write = tlbflush_write_file,
+ .llseek = default_llseek,
+};
+
+static int __init create_tlb_flushall_shift(void)
+{
+ debugfs_create_file("tlb_flushall_shift", S_IRUSR | S_IWUSR,
+ arch_debugfs_dir, NULL, &fops_tlbflush);
+ return 0;
+}
+late_initcall(create_tlb_flushall_shift);
+#endif
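
The invlpg-versus-full-flush decision in flush_tlb_mm_range() boils down to comparing the number of pages in the range with the last-level TLB size shifted right by tlb_flushall_shift. A rough sketch of that arithmetic with made-up numbers (it ignores the mm->total_vm clamp in the real code):

	#include <stdbool.h>

	static bool demo_use_full_flush(unsigned long nr_pages,
					unsigned int tlb_entries,	/* e.g. 512 */
					int flushall_shift)		/* e.g. 5 */
	{
		unsigned int act_entries;

		if (flushall_shift == -1)	/* heuristic disabled */
			return true;

		act_entries = tlb_entries >> flushall_shift;	/* 512 >> 5 = 16 */
		return nr_pages > act_entries;	/* more than 16 pages: flush all */
	}

When CONFIG_DEBUG_TLBFLUSH is set, the shift can be changed at run time through the tlb_flushall_shift file the patch creates under arch_debugfs_dir (the x86 directory in debugfs); writing -1 disables the per-page invlpg path entirely.
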