aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--arch/x86/mm/Makefile9
-rw-r--r--arch/x86/mm/amdtopology.c (renamed from arch/x86/mm/k8topology_64.c)117
-rw-r--r--arch/x86/mm/dump_pagetables.c118
-rw-r--r--arch/x86/mm/extable.c144
-rw-r--r--arch/x86/mm/fault.c307
-rw-r--r--arch/x86/mm/gup.c41
-rw-r--r--arch/x86/mm/highmem_32.c13
-rw-r--r--arch/x86/mm/hugetlbpage.c318
-rw-r--r--arch/x86/mm/init.c636
-rw-r--r--arch/x86/mm/init_32.c265
-rw-r--r--arch/x86/mm/init_64.c887
-rw-r--r--arch/x86/mm/ioremap.c280
-rw-r--r--arch/x86/mm/kmemcheck/error.c2
-rw-r--r--arch/x86/mm/kmemcheck/kmemcheck.c8
-rw-r--r--arch/x86/mm/kmemcheck/selftest.c1
-rw-r--r--arch/x86/mm/kmmio.c1
-rw-r--r--arch/x86/mm/memblock.c348
-rw-r--r--arch/x86/mm/memtest.c43
-rw-r--r--arch/x86/mm/mm_internal.h19
-rw-r--r--arch/x86/mm/mmap.c32
-rw-r--r--arch/x86/mm/mmio-mod.c11
-rw-r--r--arch/x86/mm/numa.c843
-rw-r--r--arch/x86/mm/numa_32.c365
-rw-r--r--arch/x86/mm/numa_64.c846
-rw-r--r--arch/x86/mm/numa_emulation.c502
-rw-r--r--arch/x86/mm/numa_internal.h33
-rw-r--r--arch/x86/mm/pageattr-test.c13
-rw-r--r--arch/x86/mm/pageattr.c702
-rw-r--r--arch/x86/mm/pat.c196
-rw-r--r--arch/x86/mm/pat_rbtree.c34
-rw-r--r--arch/x86/mm/pf_in.c14
-rw-r--r--arch/x86/mm/pgtable.c146
-rw-r--r--arch/x86/mm/pgtable_32.c3
-rw-r--r--arch/x86/mm/physaddr.c60
-rw-r--r--arch/x86/mm/setup_nx.c6
-rw-r--r--arch/x86/mm/srat.c222
-rw-r--r--arch/x86/mm/srat_32.c286
-rw-r--r--arch/x86/mm/srat_64.c565
-rw-r--r--arch/x86/mm/tlb.c400
39 files changed, 4625 insertions, 4211 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 55543397a8a..6a19ad9f370 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -6,6 +6,8 @@ nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_physaddr.o := $(nostackp)
CFLAGS_setup_nx.o := $(nostackp)
+CFLAGS_fault.o := -I$(src)/../include/asm/trace
+
obj-$(CONFIG_X86_PAT) += pat_rbtree.o
obj-$(CONFIG_SMP) += tlb.o
@@ -23,9 +25,8 @@ mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o
-obj-$(CONFIG_K8_NUMA) += k8topology_64.o
-obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
-
-obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
+obj-$(CONFIG_AMD_NUMA) += amdtopology.o
+obj-$(CONFIG_ACPI_NUMA) += srat.o
+obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
obj-$(CONFIG_MEMTEST) += memtest.o
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/amdtopology.c
index 804a3b6c6e1..2ca15b59fb3 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/amdtopology.c
@@ -1,8 +1,8 @@
/*
- * AMD K8 NUMA support.
+ * AMD NUMA support.
* Discover the memory map and associated nodes.
*
- * This version reads it directly from the K8 northbridge.
+ * This version reads it directly from the AMD northbridge.
*
* Copyright 2002,2003 Andi Kleen, SuSE Labs.
*/
@@ -12,6 +12,7 @@
#include <linux/module.h>
#include <linux/nodemask.h>
#include <linux/memblock.h>
+#include <linux/bootmem.h>
#include <asm/io.h>
#include <linux/pci_ids.h>
@@ -26,8 +27,7 @@
#include <asm/apic.h>
#include <asm/amd_nb.h>
-static struct bootnode __initdata nodes[8];
-static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
+static unsigned char __initdata nodeids[8];
static __init int find_northbridge(void)
{
@@ -50,14 +50,14 @@ static __init int find_northbridge(void)
return num;
}
- return -1;
+ return -ENOENT;
}
static __init void early_get_boot_cpu_id(void)
{
/*
* need to get the APIC ID of the BSP so can use that to
- * create apicid_to_node in k8_scan_nodes()
+ * create apicid_to_node in amd_scan_nodes()
*/
#ifdef CONFIG_X86_MPPARSE
/*
@@ -66,33 +66,20 @@ static __init void early_get_boot_cpu_id(void)
if (smp_found_config)
early_get_smp_config();
#endif
- early_init_lapic_mapping();
}
-int __init k8_get_nodes(struct bootnode *physnodes)
+int __init amd_numa_init(void)
{
- int i;
- int ret = 0;
-
- for_each_node_mask(i, nodes_parsed) {
- physnodes[ret].start = nodes[i].start;
- physnodes[ret].end = nodes[i].end;
- ret++;
- }
- return ret;
-}
-
-int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
-{
- unsigned long start = PFN_PHYS(start_pfn);
- unsigned long end = PFN_PHYS(end_pfn);
+ u64 start = PFN_PHYS(0);
+ u64 end = PFN_PHYS(max_pfn);
unsigned numnodes;
- unsigned long prevbase;
- int i, nb, found = 0;
+ u64 prevbase;
+ int i, j, nb;
u32 nodeid, reg;
+ unsigned int bits, cores, apicid_base;
if (!early_pci_allowed())
- return -1;
+ return -EINVAL;
nb = find_northbridge();
if (nb < 0)
@@ -103,49 +90,48 @@ int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
reg = read_pci_config(0, nb, 0, 0x60);
numnodes = ((reg >> 4) & 0xF) + 1;
if (numnodes <= 1)
- return -1;
+ return -ENOENT;
pr_info("Number of physical nodes %d\n", numnodes);
prevbase = 0;
for (i = 0; i < 8; i++) {
- unsigned long base, limit;
+ u64 base, limit;
base = read_pci_config(0, nb, 1, 0x40 + i*8);
limit = read_pci_config(0, nb, 1, 0x44 + i*8);
- nodeid = limit & 7;
+ nodeids[i] = nodeid = limit & 7;
if ((base & 3) == 0) {
if (i < numnodes)
pr_info("Skipping disabled node %d\n", i);
continue;
}
if (nodeid >= numnodes) {
- pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid,
+ pr_info("Ignoring excess node %d (%Lx:%Lx)\n", nodeid,
base, limit);
continue;
}
if (!limit) {
- pr_info("Skipping node entry %d (base %lx)\n",
+ pr_info("Skipping node entry %d (base %Lx)\n",
i, base);
continue;
}
if ((base >> 8) & 3 || (limit >> 8) & 3) {
- pr_err("Node %d using interleaving mode %lx/%lx\n",
+ pr_err("Node %d using interleaving mode %Lx/%Lx\n",
nodeid, (base >> 8) & 3, (limit >> 8) & 3);
- return -1;
+ return -EINVAL;
}
- if (node_isset(nodeid, nodes_parsed)) {
+ if (node_isset(nodeid, numa_nodes_parsed)) {
pr_info("Node %d already present, skipping\n",
nodeid);
continue;
}
limit >>= 16;
- limit <<= 24;
- limit |= (1<<24)-1;
limit++;
+ limit <<= 24;
if (limit > end)
limit = end;
@@ -164,56 +150,37 @@ int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
continue;
}
if (limit < base) {
- pr_err("Node %d bogus settings %lx-%lx.\n",
+ pr_err("Node %d bogus settings %Lx-%Lx.\n",
nodeid, base, limit);
continue;
}
/* Could sort here, but pun for now. Should not happen anyroads. */
if (prevbase > base) {
- pr_err("Node map not sorted %lx,%lx\n",
+ pr_err("Node map not sorted %Lx,%Lx\n",
prevbase, base);
- return -1;
+ return -EINVAL;
}
- pr_info("Node %d MemBase %016lx Limit %016lx\n",
+ pr_info("Node %d MemBase %016Lx Limit %016Lx\n",
nodeid, base, limit);
- found++;
-
- nodes[nodeid].start = base;
- nodes[nodeid].end = limit;
-
prevbase = base;
-
- node_set(nodeid, nodes_parsed);
+ numa_add_memblk(nodeid, base, limit);
+ node_set(nodeid, numa_nodes_parsed);
}
- if (!found)
- return -1;
- return 0;
-}
-
-int __init k8_scan_nodes(void)
-{
- unsigned int bits;
- unsigned int cores;
- unsigned int apicid_base;
- int i;
-
- BUG_ON(nodes_empty(nodes_parsed));
- node_possible_map = nodes_parsed;
- memnode_shift = compute_hash_shift(nodes, 8, NULL);
- if (memnode_shift < 0) {
- pr_err("No NUMA node hash function found. Contact maintainer\n");
- return -1;
- }
- pr_info("Using node hash shift of %d\n", memnode_shift);
+ if (!nodes_weight(numa_nodes_parsed))
+ return -ENOENT;
- /* use the coreid bits from early_identify_cpu */
+ /*
+ * We seem to have valid NUMA configuration. Map apicids to nodes
+ * using the coreid bits from early_identify_cpu.
+ */
bits = boot_cpu_data.x86_coreid_bits;
- cores = (1<<bits);
+ cores = 1 << bits;
apicid_base = 0;
+
/* get the APIC ID of the BSP early for systems with apicid lifting */
early_get_boot_cpu_id();
if (boot_cpu_physical_apicid > 0) {
@@ -221,17 +188,9 @@ int __init k8_scan_nodes(void)
apicid_base = boot_cpu_physical_apicid;
}
- for_each_node_mask(i, node_possible_map) {
- int j;
-
- memblock_x86_register_active_regions(i,
- nodes[i].start >> PAGE_SHIFT,
- nodes[i].end >> PAGE_SHIFT);
+ for_each_node_mask(i, numa_nodes_parsed)
for (j = apicid_base; j < cores + apicid_base; j++)
- apicid_to_node[(i << bits) + j] = i;
- setup_node_bootmem(i, nodes[i].start, nodes[i].end);
- }
+ set_apicid_to_node((i << bits) + j, i);
- numa_init_array();
return 0;
}
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 0002a3a3308..167ffcac16e 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -30,11 +30,14 @@ struct pg_state {
unsigned long start_address;
unsigned long current_address;
const struct addr_marker *marker;
+ unsigned long lines;
+ bool to_dmesg;
};
struct addr_marker {
unsigned long start_address;
const char *name;
+ unsigned long max_lines;
};
/* indices for address_markers; keep sync'd w/ address_markers below */
@@ -45,6 +48,7 @@ enum address_markers_idx {
LOW_KERNEL_NR,
VMALLOC_START_NR,
VMEMMAP_START_NR,
+ ESPFIX_START_NR,
HIGH_KERNEL_NR,
MODULES_VADDR_NR,
MODULES_END_NR,
@@ -67,6 +71,7 @@ static struct addr_marker address_markers[] = {
{ PAGE_OFFSET, "Low Kernel Mapping" },
{ VMALLOC_START, "vmalloc() Area" },
{ VMEMMAP_START, "Vmemmap" },
+ { ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
{ __START_KERNEL_map, "High Kernel Mapping" },
{ MODULES_VADDR, "Modules" },
{ MODULES_END, "End Modules" },
@@ -88,10 +93,28 @@ static struct addr_marker address_markers[] = {
#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
+#define pt_dump_seq_printf(m, to_dmesg, fmt, args...) \
+({ \
+ if (to_dmesg) \
+ printk(KERN_INFO fmt, ##args); \
+ else \
+ if (m) \
+ seq_printf(m, fmt, ##args); \
+})
+
+#define pt_dump_cont_printf(m, to_dmesg, fmt, args...) \
+({ \
+ if (to_dmesg) \
+ printk(KERN_CONT fmt, ##args); \
+ else \
+ if (m) \
+ seq_printf(m, fmt, ##args); \
+})
+
/*
* Print a readable form of a pgprot_t to the seq_file
*/
-static void printk_prot(struct seq_file *m, pgprot_t prot, int level)
+static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
{
pgprotval_t pr = pgprot_val(prot);
static const char * const level_name[] =
@@ -99,47 +122,47 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level)
if (!pgprot_val(prot)) {
/* Not present */
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, dmsg, " ");
} else {
if (pr & _PAGE_USER)
- seq_printf(m, "USR ");
+ pt_dump_cont_printf(m, dmsg, "USR ");
else
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, dmsg, " ");
if (pr & _PAGE_RW)
- seq_printf(m, "RW ");
+ pt_dump_cont_printf(m, dmsg, "RW ");
else
- seq_printf(m, "ro ");
+ pt_dump_cont_printf(m, dmsg, "ro ");
if (pr & _PAGE_PWT)
- seq_printf(m, "PWT ");
+ pt_dump_cont_printf(m, dmsg, "PWT ");
else
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, dmsg, " ");
if (pr & _PAGE_PCD)
- seq_printf(m, "PCD ");
+ pt_dump_cont_printf(m, dmsg, "PCD ");
else
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, dmsg, " ");
/* Bit 9 has a different meaning on level 3 vs 4 */
if (level <= 3) {
if (pr & _PAGE_PSE)
- seq_printf(m, "PSE ");
+ pt_dump_cont_printf(m, dmsg, "PSE ");
else
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, dmsg, " ");
} else {
if (pr & _PAGE_PAT)
- seq_printf(m, "pat ");
+ pt_dump_cont_printf(m, dmsg, "pat ");
else
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, dmsg, " ");
}
if (pr & _PAGE_GLOBAL)
- seq_printf(m, "GLB ");
+ pt_dump_cont_printf(m, dmsg, "GLB ");
else
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, dmsg, " ");
if (pr & _PAGE_NX)
- seq_printf(m, "NX ");
+ pt_dump_cont_printf(m, dmsg, "NX ");
else
- seq_printf(m, "x ");
+ pt_dump_cont_printf(m, dmsg, "x ");
}
- seq_printf(m, "%s\n", level_name[level]);
+ pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]);
}
/*
@@ -163,7 +186,7 @@ static void note_page(struct seq_file *m, struct pg_state *st,
pgprot_t new_prot, int level)
{
pgprotval_t prot, cur;
- static const char units[] = "KMGTPE";
+ static const char units[] = "BKMGTPE";
/*
* If we have a "break" in the series, we need to flush the state that
@@ -178,7 +201,9 @@ static void note_page(struct seq_file *m, struct pg_state *st,
st->current_prot = new_prot;
st->level = level;
st->marker = address_markers;
- seq_printf(m, "---[ %s ]---\n", st->marker->name);
+ st->lines = 0;
+ pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
+ st->marker->name);
} else if (prot != cur || level != st->level ||
st->current_address >= st->marker[1].start_address) {
const char *unit = units;
@@ -188,17 +213,24 @@ static void note_page(struct seq_file *m, struct pg_state *st,
/*
* Now print the actual finished series
*/
- seq_printf(m, "0x%0*lx-0x%0*lx ",
- width, st->start_address,
- width, st->current_address);
-
- delta = (st->current_address - st->start_address) >> 10;
- while (!(delta & 1023) && unit[1]) {
- delta >>= 10;
- unit++;
+ if (!st->marker->max_lines ||
+ st->lines < st->marker->max_lines) {
+ pt_dump_seq_printf(m, st->to_dmesg,
+ "0x%0*lx-0x%0*lx ",
+ width, st->start_address,
+ width, st->current_address);
+
+ delta = st->current_address - st->start_address;
+ while (!(delta & 1023) && unit[1]) {
+ delta >>= 10;
+ unit++;
+ }
+ pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ",
+ delta, *unit);
+ printk_prot(m, st->current_prot, st->level,
+ st->to_dmesg);
}
- seq_printf(m, "%9lu%c ", delta, *unit);
- printk_prot(m, st->current_prot, st->level);
+ st->lines++;
/*
* We print markers for special areas of address space,
@@ -206,8 +238,19 @@ static void note_page(struct seq_file *m, struct pg_state *st,
* This helps in the interpretation.
*/
if (st->current_address >= st->marker[1].start_address) {
+ if (st->marker->max_lines &&
+ st->lines > st->marker->max_lines) {
+ unsigned long nskip =
+ st->lines - st->marker->max_lines;
+ pt_dump_seq_printf(m, st->to_dmesg,
+ "... %lu entr%s skipped ... \n",
+ nskip,
+ nskip == 1 ? "y" : "ies");
+ }
st->marker++;
- seq_printf(m, "---[ %s ]---\n", st->marker->name);
+ st->lines = 0;
+ pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
+ st->marker->name);
}
st->start_address = st->current_address;
@@ -296,7 +339,7 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
#define pgd_none(a) pud_none(__pud(pgd_val(a)))
#endif
-static void walk_pgd_level(struct seq_file *m)
+void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
{
#ifdef CONFIG_X86_64
pgd_t *start = (pgd_t *) &init_level4_pgt;
@@ -304,9 +347,12 @@ static void walk_pgd_level(struct seq_file *m)
pgd_t *start = swapper_pg_dir;
#endif
int i;
- struct pg_state st;
+ struct pg_state st = {};
- memset(&st, 0, sizeof(st));
+ if (pgd) {
+ start = pgd;
+ st.to_dmesg = true;
+ }
for (i = 0; i < PTRS_PER_PGD; i++) {
st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
@@ -331,7 +377,7 @@ static void walk_pgd_level(struct seq_file *m)
static int ptdump_show(struct seq_file *m, void *v)
{
- walk_pgd_level(m);
+ ptdump_walk_pgd_level(m, NULL);
return 0;
}
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index d0474ad2a6e..903ec1e9c32 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -1,11 +1,23 @@
#include <linux/module.h>
#include <linux/spinlock.h>
+#include <linux/sort.h>
#include <asm/uaccess.h>
+static inline unsigned long
+ex_insn_addr(const struct exception_table_entry *x)
+{
+ return (unsigned long)&x->insn + x->insn;
+}
+static inline unsigned long
+ex_fixup_addr(const struct exception_table_entry *x)
+{
+ return (unsigned long)&x->fixup + x->fixup;
+}
int fixup_exception(struct pt_regs *regs)
{
const struct exception_table_entry *fixup;
+ unsigned long new_ip;
#ifdef CONFIG_PNPBIOS
if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
@@ -23,15 +35,135 @@ int fixup_exception(struct pt_regs *regs)
fixup = search_exception_tables(regs->ip);
if (fixup) {
- /* If fixup is less than 16, it means uaccess error */
- if (fixup->fixup < 16) {
- current_thread_info()->uaccess_err = -EFAULT;
- regs->ip += fixup->fixup;
- return 1;
+ new_ip = ex_fixup_addr(fixup);
+
+ if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
+ /* Special hack for uaccess_err */
+ current_thread_info()->uaccess_err = 1;
+ new_ip -= 0x7ffffff0;
}
- regs->ip = fixup->fixup;
+ regs->ip = new_ip;
return 1;
}
return 0;
}
+
+/* Restricted version used during very early boot */
+int __init early_fixup_exception(unsigned long *ip)
+{
+ const struct exception_table_entry *fixup;
+ unsigned long new_ip;
+
+ fixup = search_exception_tables(*ip);
+ if (fixup) {
+ new_ip = ex_fixup_addr(fixup);
+
+ if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
+ /* uaccess handling not supported during early boot */
+ return 0;
+ }
+
+ *ip = new_ip;
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Search one exception table for an entry corresponding to the
+ * given instruction address, and return the address of the entry,
+ * or NULL if none is found.
+ * We use a binary search, and thus we assume that the table is
+ * already sorted.
+ */
+const struct exception_table_entry *
+search_extable(const struct exception_table_entry *first,
+ const struct exception_table_entry *last,
+ unsigned long value)
+{
+ while (first <= last) {
+ const struct exception_table_entry *mid;
+ unsigned long addr;
+
+ mid = ((last - first) >> 1) + first;
+ addr = ex_insn_addr(mid);
+ if (addr < value)
+ first = mid + 1;
+ else if (addr > value)
+ last = mid - 1;
+ else
+ return mid;
+ }
+ return NULL;
+}
+
+/*
+ * The exception table needs to be sorted so that the binary
+ * search that we use to find entries in it works properly.
+ * This is used both for the kernel exception table and for
+ * the exception tables of modules that get loaded.
+ *
+ */
+static int cmp_ex(const void *a, const void *b)
+{
+ const struct exception_table_entry *x = a, *y = b;
+
+ /*
+ * This value will always end up fittin in an int, because on
+ * both i386 and x86-64 the kernel symbol-reachable address
+ * space is < 2 GiB.
+ *
+ * This compare is only valid after normalization.
+ */
+ return x->insn - y->insn;
+}
+
+void sort_extable(struct exception_table_entry *start,
+ struct exception_table_entry *finish)
+{
+ struct exception_table_entry *p;
+ int i;
+
+ /* Convert all entries to being relative to the start of the section */
+ i = 0;
+ for (p = start; p < finish; p++) {
+ p->insn += i;
+ i += 4;
+ p->fixup += i;
+ i += 4;
+ }
+
+ sort(start, finish - start, sizeof(struct exception_table_entry),
+ cmp_ex, NULL);
+
+ /* Denormalize all entries */
+ i = 0;
+ for (p = start; p < finish; p++) {
+ p->insn -= i;
+ i += 4;
+ p->fixup -= i;
+ i += 4;
+ }
+}
+
+#ifdef CONFIG_MODULES
+/*
+ * If the exception table is sorted, any referring to the module init
+ * will be at the beginning or the end.
+ */
+void trim_init_extable(struct module *m)
+{
+ /*trim the beginning*/
+ while (m->num_exentries &&
+ within_module_init(ex_insn_addr(&m->extable[0]), m)) {
+ m->extable++;
+ m->num_exentries--;
+ }
+ /*trim the end*/
+ while (m->num_exentries &&
+ within_module_init(ex_insn_addr(&m->extable[m->num_exentries-1]), m))
+ m->num_exentries--;
+}
+#endif /* CONFIG_MODULES */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 7d90ceb882a..36642793e31 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -8,14 +8,21 @@
#include <linux/kdebug.h> /* oops_begin/end, ... */
#include <linux/module.h> /* search_exception_table */
#include <linux/bootmem.h> /* max_low_pfn */
-#include <linux/kprobes.h> /* __kprobes, ... */
+#include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */
#include <linux/mmiotrace.h> /* kmmio_handler, ... */
#include <linux/perf_event.h> /* perf_sw_event */
#include <linux/hugetlb.h> /* hstate_index_to_shift */
+#include <linux/prefetch.h> /* prefetchw */
+#include <linux/context_tracking.h> /* exception_enter(), ... */
#include <asm/traps.h> /* dotraplinkage, ... */
#include <asm/pgalloc.h> /* pgd_*(), ... */
#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
+#include <asm/fixmap.h> /* VSYSCALL_ADDR */
+#include <asm/vsyscall.h> /* emulate_vsyscall */
+
+#define CREATE_TRACE_POINTS
+#include <asm/trace/exceptions.h>
/*
* Page fault error code bits:
@@ -39,7 +46,7 @@ enum x86_pf_error_code {
* Returns 0 if mmiotrace is disabled, or if the fault is not
* handled by mmiotrace:
*/
-static inline int __kprobes
+static nokprobe_inline int
kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
if (unlikely(is_kmmio_active()))
@@ -48,7 +55,7 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr)
return 0;
}
-static inline int __kprobes notify_page_fault(struct pt_regs *regs)
+static nokprobe_inline int kprobes_fault(struct pt_regs *regs)
{
int ret = 0;
@@ -104,7 +111,7 @@ check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
* but for now it's good enough to assume that long
* mode only uses well known segments or kernel.
*/
- return (!user_mode(regs)) || (regs->cs == __USER_CS);
+ return (!user_mode(regs) || user_64bit_mode(regs));
#endif
case 0x60:
/* 0x64 thru 0x67 are valid prefixes in all modes. */
@@ -229,15 +236,14 @@ void vmalloc_sync_all(void)
for (address = VMALLOC_START & PMD_MASK;
address >= TASK_SIZE && address < FIXADDR_TOP;
address += PMD_SIZE) {
-
- unsigned long flags;
struct page *page;
- spin_lock_irqsave(&pgd_lock, flags);
+ spin_lock(&pgd_lock);
list_for_each_entry(page, &pgd_list, lru) {
spinlock_t *pgt_lock;
pmd_t *ret;
+ /* the pgt_lock only for Xen */
pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
spin_lock(pgt_lock);
@@ -247,7 +253,7 @@ void vmalloc_sync_all(void)
if (!ret)
break;
}
- spin_unlock_irqrestore(&pgd_lock, flags);
+ spin_unlock(&pgd_lock);
}
}
@@ -256,7 +262,7 @@ void vmalloc_sync_all(void)
*
* Handle a fault on the vmalloc or module mapping area
*/
-static noinline __kprobes int vmalloc_fault(unsigned long address)
+static noinline int vmalloc_fault(unsigned long address)
{
unsigned long pgd_paddr;
pmd_t *pmd_k;
@@ -286,6 +292,7 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
return 0;
}
+NOKPROBE_SYMBOL(vmalloc_fault);
/*
* Did it hit the DOS screen memory VA from vm86 mode?
@@ -353,7 +360,7 @@ void vmalloc_sync_all(void)
*
* This assumes no large pages in there.
*/
-static noinline __kprobes int vmalloc_fault(unsigned long address)
+static noinline int vmalloc_fault(unsigned long address)
{
pgd_t *pgd, *pgd_ref;
pud_t *pud, *pud_ref;
@@ -376,10 +383,12 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
if (pgd_none(*pgd_ref))
return -1;
- if (pgd_none(*pgd))
+ if (pgd_none(*pgd)) {
set_pgd(pgd, *pgd_ref);
- else
+ arch_flush_lazy_mmu_mode();
+ } else {
BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+ }
/*
* Below here mismatches are bugs because these lower tables
@@ -418,13 +427,16 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
return 0;
}
+NOKPROBE_SYMBOL(vmalloc_fault);
+#ifdef CONFIG_CPU_SUP_AMD
static const char errata93_warning[] =
KERN_ERR
"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
"******* Working around it, but it may cause SEGVs or burn power.\n"
"******* Please consider a BIOS update.\n"
"******* Disabling USB legacy in the BIOS may also help.\n";
+#endif
/*
* No vm86 mode in 64-bit mode:
@@ -504,7 +516,11 @@ bad:
*/
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
-#ifdef CONFIG_X86_64
+#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
+ || boot_cpu_data.x86 != 0xf)
+ return 0;
+
if (address != regs->ip)
return 0;
@@ -547,7 +563,7 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
/*
* Pentium F0 0F C7 C8 bug workaround:
*/
- if (boot_cpu_data.f00f_bug) {
+ if (boot_cpu_has_bug(X86_BUG_F00F)) {
nr = (address - idt_descr.address) >> 3;
if (nr == 6) {
@@ -571,11 +587,16 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
if (error_code & PF_INSTR) {
unsigned int level;
+ pgd_t *pgd;
+ pte_t *pte;
+
+ pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
+ pgd += pgd_index(address);
- pte_t *pte = lookup_address(address, &level);
+ pte = lookup_address_in_pgd(pgd, address, &level);
if (pte && pte_present(*pte) && !pte_exec(*pte))
- printk(nx_warning, current_uid());
+ printk(nx_warning, from_kuid(&init_user_ns, current_uid()));
}
printk(KERN_ALERT "BUG: unable to handle kernel ");
@@ -586,7 +607,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
printk(KERN_CONT " at %p\n", (void *) address);
printk(KERN_ALERT "IP:");
- printk_address(regs->ip, 1);
+ printk_address(regs->ip);
dump_pagetable(address);
}
@@ -608,7 +629,7 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code,
dump_pagetable(address);
tsk->thread.cr2 = address;
- tsk->thread.trap_no = 14;
+ tsk->thread.trap_nr = X86_TRAP_PF;
tsk->thread.error_code = error_code;
if (__die("Bad pagetable", regs, error_code))
@@ -619,7 +640,7 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code,
static noinline void
no_context(struct pt_regs *regs, unsigned long error_code,
- unsigned long address)
+ unsigned long address, int signal, int si_code)
{
struct task_struct *tsk = current;
unsigned long *stackend;
@@ -627,8 +648,35 @@ no_context(struct pt_regs *regs, unsigned long error_code,
int sig;
/* Are we prepared to handle this kernel fault? */
- if (fixup_exception(regs))
+ if (fixup_exception(regs)) {
+ /*
+ * Any interrupt that takes a fault gets the fixup. This makes
+ * the below recursive fault logic only apply to a faults from
+ * task context.
+ */
+ if (in_interrupt())
+ return;
+
+ /*
+ * Per the above we're !in_interrupt(), aka. task context.
+ *
+ * In this case we need to make sure we're not recursively
+ * faulting through the emulate_vsyscall() logic.
+ */
+ if (current_thread_info()->sig_on_uaccess_error && signal) {
+ tsk->thread.trap_nr = X86_TRAP_PF;
+ tsk->thread.error_code = error_code | PF_USER;
+ tsk->thread.cr2 = address;
+
+ /* XXX: hwpoison faults will set the wrong code. */
+ force_sig_info_fault(signal, si_code, address, tsk, 0);
+ }
+
+ /*
+ * Barring that, we can do the fixup and be happy.
+ */
return;
+ }
/*
* 32-bit:
@@ -657,10 +705,10 @@ no_context(struct pt_regs *regs, unsigned long error_code,
stackend = end_of_stack(tsk);
if (tsk != &init_task && *stackend != STACK_END_MAGIC)
- printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
+ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
tsk->thread.cr2 = address;
- tsk->thread.trap_no = 14;
+ tsk->thread.trap_nr = X86_TRAP_PF;
tsk->thread.error_code = error_code;
sig = SIGKILL;
@@ -668,7 +716,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
sig = 0;
/* Executive summary in case the body of the oops scrolled away */
- printk(KERN_EMERG "CR2: %016lx\n", address);
+ printk(KERN_DEFAULT "CR2: %016lx\n", address);
oops_end(flags, regs, sig);
}
@@ -720,13 +768,27 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
if (is_errata100(regs, address))
return;
- if (unlikely(show_unhandled_signals))
+#ifdef CONFIG_X86_64
+ /*
+ * Instruction fetch faults in the vsyscall page might need
+ * emulation.
+ */
+ if (unlikely((error_code & PF_INSTR) &&
+ ((address & ~0xfff) == VSYSCALL_ADDR))) {
+ if (emulate_vsyscall(regs, address))
+ return;
+ }
+#endif
+ /* Kernel addresses are always protection faults: */
+ if (address >= TASK_SIZE)
+ error_code |= PF_PROT;
+
+ if (likely(show_unhandled_signals))
show_signal_msg(regs, error_code, address, tsk);
- /* Kernel addresses are always protection faults: */
tsk->thread.cr2 = address;
- tsk->thread.error_code = error_code | (address >= TASK_SIZE);
- tsk->thread.trap_no = 14;
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = X86_TRAP_PF;
force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
@@ -736,7 +798,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
if (is_f00f_bug(regs, address))
return;
- no_context(regs, error_code, address);
+ no_context(regs, error_code, address, SIGSEGV, si_code);
}
static noinline void
@@ -774,20 +836,6 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
__bad_area(regs, error_code, address, SEGV_ACCERR);
}
-/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
-static void
-out_of_memory(struct pt_regs *regs, unsigned long error_code,
- unsigned long address)
-{
- /*
- * We ran out of memory, call the OOM killer, and return the userspace
- * (which will retry the fault, or kill us if we got oom-killed):
- */
- up_read(&current->mm->mmap_sem);
-
- pagefault_out_of_memory();
-}
-
static void
do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
unsigned int fault)
@@ -800,7 +848,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
/* Kernel mode? Handle exceptions or die: */
if (!(error_code & PF_USER)) {
- no_context(regs, error_code, address);
+ no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
return;
}
@@ -810,7 +858,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
tsk->thread.cr2 = address;
tsk->thread.error_code = error_code;
- tsk->thread.trap_no = 14;
+ tsk->thread.trap_nr = X86_TRAP_PF;
#ifdef CONFIG_MEMORY_FAILURE
if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
@@ -827,8 +875,29 @@ static noinline void
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
unsigned long address, unsigned int fault)
{
+ if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
+ up_read(&current->mm->mmap_sem);
+ no_context(regs, error_code, address, 0, 0);
+ return;
+ }
+
if (fault & VM_FAULT_OOM) {
- out_of_memory(regs, error_code, address);
+ /* Kernel mode? Handle exceptions or die: */
+ if (!(error_code & PF_USER)) {
+ up_read(&current->mm->mmap_sem);
+ no_context(regs, error_code, address,
+ SIGSEGV, SEGV_MAPERR);
+ return;
+ }
+
+ up_read(&current->mm->mmap_sem);
+
+ /*
+ * We ran out of memory, call the OOM killer, and return the
+ * userspace (which will retry the fault, or kill us if we got
+ * oom-killed):
+ */
+ pagefault_out_of_memory();
} else {
if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
VM_FAULT_HWPOISON_LARGE))
@@ -861,7 +930,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
* There are no security implications to leaving a stale TLB when
* increasing the permissions on a page.
*/
-static noinline __kprobes int
+static noinline int
spurious_fault(unsigned long error_code, unsigned long address)
{
pgd_t *pgd;
@@ -892,14 +961,8 @@ spurious_fault(unsigned long error_code, unsigned long address)
if (pmd_large(*pmd))
return spurious_fault_check(error_code, (pte_t *) pmd);
- /*
- * Note: don't use pte_present() here, since it returns true
- * if the _PAGE_PROTNONE bit is set. However, this aliases the
- * _PAGE_GLOBAL bit, which for kernel pages give false positives
- * when CONFIG_DEBUG_PAGEALLOC is used.
- */
pte = pte_offset_kernel(pmd, address);
- if (!(pte_flags(*pte) & _PAGE_PRESENT))
+ if (!pte_present(*pte))
return 0;
ret = spurious_fault_check(error_code, pte);
@@ -915,6 +978,7 @@ spurious_fault(unsigned long error_code, unsigned long address)
return ret;
}
+NOKPROBE_SYMBOL(spurious_fault);
int show_unhandled_signals = 1;
@@ -944,29 +1008,45 @@ static int fault_in_kernel_space(unsigned long address)
return address >= TASK_SIZE_MAX;
}
+static inline bool smap_violation(int error_code, struct pt_regs *regs)
+{
+ if (!IS_ENABLED(CONFIG_X86_SMAP))
+ return false;
+
+ if (!static_cpu_has(X86_FEATURE_SMAP))
+ return false;
+
+ if (error_code & PF_USER)
+ return false;
+
+ if (!user_mode_vm(regs) && (regs->flags & X86_EFLAGS_AC))
+ return false;
+
+ return true;
+}
+
/*
* This routine handles page faults. It determines the address,
* and the problem, and then passes it off to one of the appropriate
* routines.
+ *
+ * This function must have noinline because both callers
+ * {,trace_}do_page_fault() have notrace on. Having this an actual function
+ * guarantees there's a function trace entry.
*/
-dotraplinkage void __kprobes
-do_page_fault(struct pt_regs *regs, unsigned long error_code)
+static noinline void
+__do_page_fault(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
- unsigned long address;
struct mm_struct *mm;
int fault;
- int write = error_code & PF_WRITE;
- unsigned int flags = FAULT_FLAG_ALLOW_RETRY |
- (write ? FAULT_FLAG_WRITE : 0);
+ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
tsk = current;
mm = tsk->mm;
- /* Get the faulting address: */
- address = read_cr2();
-
/*
* Detect and handle instructions that would cause a page fault for
* both a tracked kernel page and a userspace page.
@@ -1005,7 +1085,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
return;
/* kprobes don't want to hook the spurious faults: */
- if (notify_page_fault(regs))
+ if (kprobes_fault(regs))
return;
/*
* Don't take the mm semaphore here. If we fixup a prefetch
@@ -1017,8 +1097,26 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
}
/* kprobes don't want to hook the spurious faults: */
- if (unlikely(notify_page_fault(regs)))
+ if (unlikely(kprobes_fault(regs)))
+ return;
+
+ if (unlikely(error_code & PF_RSVD))
+ pgtable_bad(regs, error_code, address);
+
+ if (unlikely(smap_violation(error_code, regs))) {
+ bad_area_nosemaphore(regs, error_code, address);
+ return;
+ }
+
+ /*
+ * If we're in an interrupt, have no user context or are running
+ * in an atomic region then we must not take the fault:
+ */
+ if (unlikely(in_atomic() || !mm)) {
+ bad_area_nosemaphore(regs, error_code, address);
return;
+ }
+
/*
* It's safe to allow irq's after cr2 has been saved and the
* vmalloc fault has been handled.
@@ -1029,24 +1127,16 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
if (user_mode_vm(regs)) {
local_irq_enable();
error_code |= PF_USER;
+ flags |= FAULT_FLAG_USER;
} else {
if (regs->flags & X86_EFLAGS_IF)
local_irq_enable();
}
- if (unlikely(error_code & PF_RSVD))
- pgtable_bad(regs, error_code, address);
-
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
- /*
- * If we're in an interrupt, have no user context or are running
- * in an atomic region then we must not take the fault:
- */
- if (unlikely(in_atomic() || !mm)) {
- bad_area_nosemaphore(regs, error_code, address);
- return;
- }
+ if (error_code & PF_WRITE)
+ flags |= FAULT_FLAG_WRITE;
/*
* When running in the kernel we expect faults to occur only to
@@ -1126,6 +1216,14 @@ good_area:
*/
fault = handle_mm_fault(mm, vma, address, flags);
+ /*
+ * If we need to retry but a fatal signal is pending, handle the
+ * signal first. We do not need to release the mmap_sem because it
+ * would already be released in __lock_page_or_retry in mm/filemap.c.
+ */
+ if (unlikely((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)))
+ return;
+
if (unlikely(fault & VM_FAULT_ERROR)) {
mm_fault_error(regs, error_code, address, fault);
return;
@@ -1139,17 +1237,18 @@ good_area:
if (flags & FAULT_FLAG_ALLOW_RETRY) {
if (fault & VM_FAULT_MAJOR) {
tsk->maj_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
regs, address);
} else {
tsk->min_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
regs, address);
}
if (fault & VM_FAULT_RETRY) {
/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
* of starvation. */
flags &= ~FAULT_FLAG_ALLOW_RETRY;
+ flags |= FAULT_FLAG_TRIED;
goto retry;
}
}
@@ -1158,3 +1257,55 @@ good_area:
up_read(&mm->mmap_sem);
}
+NOKPROBE_SYMBOL(__do_page_fault);
+
+dotraplinkage void notrace
+do_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+ unsigned long address = read_cr2(); /* Get the faulting address */
+ enum ctx_state prev_state;
+
+ /*
+ * We must have this function tagged with __kprobes, notrace and call
+ * read_cr2() before calling anything else. To avoid calling any kind
+ * of tracing machinery before we've observed the CR2 value.
+ *
+ * exception_{enter,exit}() contain all sorts of tracepoints.
+ */
+
+ prev_state = exception_enter();
+ __do_page_fault(regs, error_code, address);
+ exception_exit(prev_state);
+}
+NOKPROBE_SYMBOL(do_page_fault);
+
+#ifdef CONFIG_TRACING
+static nokprobe_inline void
+trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
+ unsigned long error_code)
+{
+ if (user_mode(regs))
+ trace_page_fault_user(address, regs, error_code);
+ else
+ trace_page_fault_kernel(address, regs, error_code);
+}
+
+dotraplinkage void notrace
+trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+ /*
+ * The exception_enter and tracepoint processing could
+ * trigger another page faults (user space callchain
+ * reading) and destroy the original cr2 value, so read
+ * the faulting address now.
+ */
+ unsigned long address = read_cr2();
+ enum ctx_state prev_state;
+
+ prev_state = exception_enter();
+ trace_page_fault_entries(address, regs, error_code);
+ __do_page_fault(regs, error_code, address);
+ exception_exit(prev_state);
+}
+NOKPROBE_SYMBOL(trace_do_page_fault);
+#endif /* CONFIG_TRACING */
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 738e6593799..207d9aef662 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -8,6 +8,7 @@
#include <linux/mm.h>
#include <linux/vmstat.h>
#include <linux/highmem.h>
+#include <linux/swap.h>
#include <asm/pgtable.h>
@@ -82,6 +83,12 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
pte_t pte = gup_get_pte(ptep);
struct page *page;
+ /* Similar to the PMD case, NUMA hinting must take slow path */
+ if (pte_numa(pte)) {
+ pte_unmap(ptep);
+ return 0;
+ }
+
if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
pte_unmap(ptep);
return 0;
@@ -89,6 +96,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
get_page(page);
+ SetPageReferenced(page);
pages[*nr] = page;
(*nr)++;
@@ -100,9 +108,10 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
static inline void get_head_page_multiple(struct page *page, int nr)
{
- VM_BUG_ON(page != compound_head(page));
- VM_BUG_ON(page_count(page) == 0);
+ VM_BUG_ON_PAGE(page != compound_head(page), page);
+ VM_BUG_ON_PAGE(page_count(page) == 0, page);
atomic_add(nr, &page->_count);
+ SetPageReferenced(page);
}
static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
@@ -126,8 +135,10 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
head = pte_page(pte);
page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
do {
- VM_BUG_ON(compound_head(page) != head);
+ VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
+ if (PageTail(page))
+ get_huge_page_tail(page);
(*nr)++;
page++;
refs++;
@@ -148,9 +159,27 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
pmd_t pmd = *pmdp;
next = pmd_addr_end(addr, end);
- if (pmd_none(pmd))
+ /*
+ * The pmd_trans_splitting() check below explains why
+ * pmdp_splitting_flush has to flush the tlb, to stop
+ * this gup-fast code from running while we set the
+ * splitting bit in the pmd. Returning zero will take
+ * the slow path that will call wait_split_huge_page()
+ * if the pmd is still in splitting state. gup-fast
+ * can't because it has irq disabled and
+ * wait_split_huge_page() would never return as the
+ * tlb flush IPI wouldn't run.
+ */
+ if (pmd_none(pmd) || pmd_trans_splitting(pmd))
return 0;
if (unlikely(pmd_large(pmd))) {
+ /*
+ * NUMA hinting faults need to be handled in the GUP
+ * slowpath for accounting purposes and so that they
+ * can be serialised against THP migration.
+ */
+ if (pmd_numa(pmd))
+ return 0;
if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
return 0;
} else {
@@ -183,8 +212,10 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
head = pte_page(pte);
page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
do {
- VM_BUG_ON(compound_head(page) != head);
+ VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
+ if (PageTail(page))
+ get_huge_page_tail(page);
(*nr)++;
page++;
refs++;
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index b4996266210..4500142bc4a 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -1,6 +1,7 @@
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/swap.h> /* for totalram_pages */
+#include <linux/bootmem.h>
void *kmap(struct page *page)
{
@@ -45,16 +46,17 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
BUG_ON(!pte_none(*(kmap_pte-idx)));
set_pte(kmap_pte-idx, mk_pte(page, prot));
+ arch_flush_lazy_mmu_mode();
return (void *)vaddr;
}
EXPORT_SYMBOL(kmap_atomic_prot);
-void *__kmap_atomic(struct page *page)
+void *kmap_atomic(struct page *page)
{
return kmap_atomic_prot(page, kmap_prot);
}
-EXPORT_SYMBOL(__kmap_atomic);
+EXPORT_SYMBOL(kmap_atomic);
/*
* This is the same as kmap_atomic() but can map memory that doesn't
@@ -88,6 +90,7 @@ void __kunmap_atomic(void *kvaddr)
*/
kpte_clear_flush(kmap_pte-idx, vaddr);
kmap_atomic_idx_pop();
+ arch_flush_lazy_mmu_mode();
}
#ifdef CONFIG_DEBUG_HIGHMEM
else {
@@ -119,6 +122,11 @@ void __init set_highmem_pages_init(void)
struct zone *zone;
int nid;
+ /*
+ * Explicitly reset zone->managed_pages because set_highmem_pages_init()
+ * is invoked before free_all_bootmem()
+ */
+ reset_all_zones_managed_pages();
for_each_zone(zone) {
unsigned long zone_start_pfn, zone_end_pfn;
@@ -135,5 +143,4 @@ void __init set_highmem_pages_init(void)
add_highpages_with_active_regions(nid, zone_start_pfn,
zone_end_pfn);
}
- totalram_pages += totalhigh_pages;
}
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 069ce7c37c0..8b977ebf938 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -16,159 +16,6 @@
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
-static unsigned long page_table_shareable(struct vm_area_struct *svma,
- struct vm_area_struct *vma,
- unsigned long addr, pgoff_t idx)
-{
- unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
- svma->vm_start;
- unsigned long sbase = saddr & PUD_MASK;
- unsigned long s_end = sbase + PUD_SIZE;
-
- /* Allow segments to share if only one is marked locked */
- unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
- unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
-
- /*
- * match the virtual addresses, permission and the alignment of the
- * page table page.
- */
- if (pmd_index(addr) != pmd_index(saddr) ||
- vm_flags != svm_flags ||
- sbase < svma->vm_start || svma->vm_end < s_end)
- return 0;
-
- return saddr;
-}
-
-static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
-{
- unsigned long base = addr & PUD_MASK;
- unsigned long end = base + PUD_SIZE;
-
- /*
- * check on proper vm_flags and page table alignment
- */
- if (vma->vm_flags & VM_MAYSHARE &&
- vma->vm_start <= base && end <= vma->vm_end)
- return 1;
- return 0;
-}
-
-/*
- * search for a shareable pmd page for hugetlb.
- */
-static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
-{
- struct vm_area_struct *vma = find_vma(mm, addr);
- struct address_space *mapping = vma->vm_file->f_mapping;
- pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
- vma->vm_pgoff;
- struct prio_tree_iter iter;
- struct vm_area_struct *svma;
- unsigned long saddr;
- pte_t *spte = NULL;
-
- if (!vma_shareable(vma, addr))
- return;
-
- spin_lock(&mapping->i_mmap_lock);
- vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
- if (svma == vma)
- continue;
-
- saddr = page_table_shareable(svma, vma, addr, idx);
- if (saddr) {
- spte = huge_pte_offset(svma->vm_mm, saddr);
- if (spte) {
- get_page(virt_to_page(spte));
- break;
- }
- }
- }
-
- if (!spte)
- goto out;
-
- spin_lock(&mm->page_table_lock);
- if (pud_none(*pud))
- pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK));
- else
- put_page(virt_to_page(spte));
- spin_unlock(&mm->page_table_lock);
-out:
- spin_unlock(&mapping->i_mmap_lock);
-}
-
-/*
- * unmap huge page backed by shared pte.
- *
- * Hugetlb pte page is ref counted at the time of mapping. If pte is shared
- * indicated by page_count > 1, unmap is achieved by clearing pud and
- * decrementing the ref count. If count == 1, the pte page is not shared.
- *
- * called with vma->vm_mm->page_table_lock held.
- *
- * returns: 1 successfully unmapped a shared pte page
- * 0 the underlying pte page is not shared, or it is the last user
- */
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
-{
- pgd_t *pgd = pgd_offset(mm, *addr);
- pud_t *pud = pud_offset(pgd, *addr);
-
- BUG_ON(page_count(virt_to_page(ptep)) == 0);
- if (page_count(virt_to_page(ptep)) == 1)
- return 0;
-
- pud_clear(pud);
- put_page(virt_to_page(ptep));
- *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
- return 1;
-}
-
-pte_t *huge_pte_alloc(struct mm_struct *mm,
- unsigned long addr, unsigned long sz)
-{
- pgd_t *pgd;
- pud_t *pud;
- pte_t *pte = NULL;
-
- pgd = pgd_offset(mm, addr);
- pud = pud_alloc(mm, pgd, addr);
- if (pud) {
- if (sz == PUD_SIZE) {
- pte = (pte_t *)pud;
- } else {
- BUG_ON(sz != PMD_SIZE);
- if (pud_none(*pud))
- huge_pmd_share(mm, addr, pud);
- pte = (pte_t *) pmd_alloc(mm, pud, addr);
- }
- }
- BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
-
- return pte;
-}
-
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd = NULL;
-
- pgd = pgd_offset(mm, addr);
- if (pgd_present(*pgd)) {
- pud = pud_offset(pgd, addr);
- if (pud_present(*pud)) {
- if (pud_large(*pud))
- return (pte_t *)pud;
- pmd = pmd_offset(pud, addr);
- }
- }
- return (pte_t *) pmd;
-}
-
#if 0 /* This is just for testing */
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
@@ -211,7 +58,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
{
return NULL;
}
-
#else
struct page *
@@ -229,77 +75,23 @@ int pud_huge(pud_t pud)
{
return !!(pud_val(pud) & _PAGE_PSE);
}
-
-struct page *
-follow_huge_pmd(struct mm_struct *mm, unsigned long address,
- pmd_t *pmd, int write)
-{
- struct page *page;
-
- page = pte_page(*(pte_t *)pmd);
- if (page)
- page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
- return page;
-}
-
-struct page *
-follow_huge_pud(struct mm_struct *mm, unsigned long address,
- pud_t *pud, int write)
-{
- struct page *page;
-
- page = pte_page(*(pte_t *)pud);
- if (page)
- page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
- return page;
-}
-
#endif
-/* x86_64 also uses this file */
-
-#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
+#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
struct hstate *h = hstate_file(file);
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- unsigned long start_addr;
-
- if (len > mm->cached_hole_size) {
- start_addr = mm->free_area_cache;
- } else {
- start_addr = TASK_UNMAPPED_BASE;
- mm->cached_hole_size = 0;
- }
-
-full_search:
- addr = ALIGN(start_addr, huge_page_size(h));
+ struct vm_unmapped_area_info info;
- for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
- /* At this point: (!vma || addr < vma->vm_end). */
- if (TASK_SIZE - len < addr) {
- /*
- * Start a new search - just in case we missed
- * some holes.
- */
- if (start_addr != TASK_UNMAPPED_BASE) {
- start_addr = TASK_UNMAPPED_BASE;
- mm->cached_hole_size = 0;
- goto full_search;
- }
- return -ENOMEM;
- }
- if (!vma || addr + len <= vma->vm_start) {
- mm->free_area_cache = addr + len;
- return addr;
- }
- if (addr + mm->cached_hole_size < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
- addr = ALIGN(vma->vm_end, huge_page_size(h));
- }
+ info.flags = 0;
+ info.length = len;
+ info.low_limit = current->mm->mmap_legacy_base;
+ info.high_limit = TASK_SIZE;
+ info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+ info.align_offset = 0;
+ return vm_unmapped_area(&info);
}
static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
@@ -307,87 +99,30 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
unsigned long pgoff, unsigned long flags)
{
struct hstate *h = hstate_file(file);
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma, *prev_vma;
- unsigned long base = mm->mmap_base, addr = addr0;
- unsigned long largest_hole = mm->cached_hole_size;
- int first_time = 1;
-
- /* don't allow allocations above current base */
- if (mm->free_area_cache > base)
- mm->free_area_cache = base;
+ struct vm_unmapped_area_info info;
+ unsigned long addr;
- if (len <= largest_hole) {
- largest_hole = 0;
- mm->free_area_cache = base;
- }
-try_again:
- /* make sure it can fit in the remaining address space */
- if (mm->free_area_cache < len)
- goto fail;
-
- /* either no address requested or cant fit in requested address hole */
- addr = (mm->free_area_cache - len) & huge_page_mask(h);
- do {
- /*
- * Lookup failure means no vma is above this address,
- * i.e. return with success:
- */
- if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
- return addr;
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ info.length = len;
+ info.low_limit = PAGE_SIZE;
+ info.high_limit = current->mm->mmap_base;
+ info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+ info.align_offset = 0;
+ addr = vm_unmapped_area(&info);
- /*
- * new region fits between prev_vma->vm_end and
- * vma->vm_start, use it:
- */
- if (addr + len <= vma->vm_start &&
- (!prev_vma || (addr >= prev_vma->vm_end))) {
- /* remember the address as a hint for next time */
- mm->cached_hole_size = largest_hole;
- return (mm->free_area_cache = addr);
- } else {
- /* pull free_area_cache down to the first hole */
- if (mm->free_area_cache == vma->vm_end) {
- mm->free_area_cache = vma->vm_start;
- mm->cached_hole_size = largest_hole;
- }
- }
-
- /* remember the largest hole we saw so far */
- if (addr + largest_hole < vma->vm_start)
- largest_hole = vma->vm_start - addr;
-
- /* try just below the current vma->vm_start */
- addr = (vma->vm_start - len) & huge_page_mask(h);
- } while (len <= vma->vm_start);
-
-fail:
- /*
- * if hint left us with no space for the requested
- * mapping then try again:
- */
- if (first_time) {
- mm->free_area_cache = base;
- largest_hole = 0;
- first_time = 0;
- goto try_again;
- }
/*
* A failed mmap() very likely causes application failure,
* so fall back to the bottom-up function here. This scenario
* can happen with large stack limits and large mmap()
* allocations.
*/
- mm->free_area_cache = TASK_UNMAPPED_BASE;
- mm->cached_hole_size = ~0UL;
- addr = hugetlb_get_unmapped_area_bottomup(file, addr0,
- len, pgoff, flags);
-
- /*
- * Restore the topdown base:
- */
- mm->free_area_cache = base;
- mm->cached_hole_size = ~0UL;
+ if (addr & ~PAGE_MASK) {
+ VM_BUG_ON(addr != -ENOMEM);
+ info.flags = 0;
+ info.low_limit = TASK_UNMAPPED_BASE;
+ info.high_limit = TASK_SIZE;
+ addr = vm_unmapped_area(&info);
+ }
return addr;
}
@@ -425,8 +160,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
return hugetlb_get_unmapped_area_topdown(file, addr, len,
pgoff, flags);
}
-
-#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/
+#endif /* CONFIG_HUGETLB_PAGE */
#ifdef CONFIG_X86_64
static __init int setup_hugepagesz(char *opt)
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index c0e28a13de7..f9713061811 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -3,6 +3,7 @@
#include <linux/ioport.h>
#include <linux/swap.h>
#include <linux/memblock.h>
+#include <linux/bootmem.h> /* for max_low_pfn */
#include <asm/cacheflush.h>
#include <asm/e820.h>
@@ -11,83 +12,103 @@
#include <asm/page_types.h>
#include <asm/sections.h>
#include <asm/setup.h>
-#include <asm/system.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/proto.h>
+#include <asm/dma.h> /* for MAX_DMA_PFN */
+#include <asm/microcode.h>
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+#include "mm_internal.h"
-unsigned long __initdata e820_table_start;
-unsigned long __meminitdata e820_table_end;
-unsigned long __meminitdata e820_table_top;
+static unsigned long __initdata pgt_buf_start;
+static unsigned long __initdata pgt_buf_end;
+static unsigned long __initdata pgt_buf_top;
-int after_bootmem;
+static unsigned long min_pfn_mapped;
-int direct_gbpages
-#ifdef CONFIG_DIRECT_GBPAGES
- = 1
-#endif
-;
+static bool __initdata can_use_brk_pgt = true;
-static void __init find_early_table_space(unsigned long end, int use_pse,
- int use_gbpages)
+/*
+ * Pages returned are already directly mapped.
+ *
+ * Changing that is likely to break Xen, see commit:
+ *
+ * 279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve
+ *
+ * for detailed information.
+ */
+__ref void *alloc_low_pages(unsigned int num)
{
- unsigned long puds, pmds, ptes, tables, start;
- phys_addr_t base;
+ unsigned long pfn;
+ int i;
- puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
- tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
+ if (after_bootmem) {
+ unsigned int order;
- if (use_gbpages) {
- unsigned long extra;
+ order = get_order((unsigned long)num << PAGE_SHIFT);
+ return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK |
+ __GFP_ZERO, order);
+ }
- extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
- pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
- } else
- pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
+ if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
+ unsigned long ret;
+ if (min_pfn_mapped >= max_pfn_mapped)
+ panic("alloc_low_pages: ran out of memory");
+ ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
+ max_pfn_mapped << PAGE_SHIFT,
+ PAGE_SIZE * num , PAGE_SIZE);
+ if (!ret)
+ panic("alloc_low_pages: can not alloc memory");
+ memblock_reserve(ret, PAGE_SIZE * num);
+ pfn = ret >> PAGE_SHIFT;
+ } else {
+ pfn = pgt_buf_end;
+ pgt_buf_end += num;
+ printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\n",
+ pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1);
+ }
- tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
+ for (i = 0; i < num; i++) {
+ void *adr;
- if (use_pse) {
- unsigned long extra;
+ adr = __va((pfn + i) << PAGE_SHIFT);
+ clear_page(adr);
+ }
- extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
-#ifdef CONFIG_X86_32
- extra += PMD_SIZE;
-#endif
- ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
- } else
- ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ return __va(pfn << PAGE_SHIFT);
+}
- tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
+/* need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS */
+#define INIT_PGT_BUF_SIZE (6 * PAGE_SIZE)
+RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
+void __init early_alloc_pgt_buf(void)
+{
+ unsigned long tables = INIT_PGT_BUF_SIZE;
+ phys_addr_t base;
-#ifdef CONFIG_X86_32
- /* for fixmap */
- tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
-#endif
+ base = __pa(extend_brk(tables, PAGE_SIZE));
- /*
- * RED-PEN putting page tables only on node 0 could
- * cause a hotspot and fill up ZONE_DMA. The page tables
- * need roughly 0.5KB per GB.
- */
-#ifdef CONFIG_X86_32
- start = 0x7000;
-#else
- start = 0x8000;
-#endif
- base = memblock_find_in_range(start, max_pfn_mapped<<PAGE_SHIFT,
- tables, PAGE_SIZE);
- if (base == MEMBLOCK_ERROR)
- panic("Cannot find space for the kernel page tables");
+ pgt_buf_start = base >> PAGE_SHIFT;
+ pgt_buf_end = pgt_buf_start;
+ pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
+}
+
+int after_bootmem;
- e820_table_start = base >> PAGE_SHIFT;
- e820_table_end = e820_table_start;
- e820_table_top = e820_table_start + (tables >> PAGE_SHIFT);
+int direct_gbpages
+#ifdef CONFIG_DIRECT_GBPAGES
+ = 1
+#endif
+;
- printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
- end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT);
+static void __init init_gbpages(void)
+{
+#ifdef CONFIG_X86_64
+ if (direct_gbpages && cpu_has_gbpages)
+ printk(KERN_INFO "Using GB pages for direct mapping\n");
+ else
+ direct_gbpages = 0;
+#endif
}
struct map_range {
@@ -96,6 +117,35 @@ struct map_range {
unsigned page_size_mask;
};
+static int page_size_mask;
+
+static void __init probe_page_size_mask(void)
+{
+ init_gbpages();
+
+#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK)
+ /*
+ * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+ * This will simplify cpa(), which otherwise needs to support splitting
+ * large pages into small in interrupt context, etc.
+ */
+ if (direct_gbpages)
+ page_size_mask |= 1 << PG_LEVEL_1G;
+ if (cpu_has_pse)
+ page_size_mask |= 1 << PG_LEVEL_2M;
+#endif
+
+ /* Enable PSE if available */
+ if (cpu_has_pse)
+ set_in_cr4(X86_CR4_PSE);
+
+ /* Enable PGE if available */
+ if (cpu_has_pge) {
+ set_in_cr4(X86_CR4_PGE);
+ __supported_pte_mask |= _PAGE_GLOBAL;
+ }
+}
+
#ifdef CONFIG_X86_32
#define NR_RANGE_MR 3
#else /* CONFIG_X86_64 */
@@ -119,57 +169,51 @@ static int __meminit save_mr(struct map_range *mr, int nr_range,
}
/*
- * Setup the direct mapping of the physical memory at PAGE_OFFSET.
- * This runs before bootmem is initialized and gets pages directly from
- * the physical memory. To access them they are temporarily mapped.
+ * adjust the page_size_mask for small range to go with
+ * big page size instead small one if nearby are ram too.
*/
-unsigned long __init_refok init_memory_mapping(unsigned long start,
- unsigned long end)
+static void __init_refok adjust_range_page_size_mask(struct map_range *mr,
+ int nr_range)
{
- unsigned long page_size_mask = 0;
- unsigned long start_pfn, end_pfn;
- unsigned long ret = 0;
- unsigned long pos;
+ int i;
- struct map_range mr[NR_RANGE_MR];
- int nr_range, i;
- int use_pse, use_gbpages;
+ for (i = 0; i < nr_range; i++) {
+ if ((page_size_mask & (1<<PG_LEVEL_2M)) &&
+ !(mr[i].page_size_mask & (1<<PG_LEVEL_2M))) {
+ unsigned long start = round_down(mr[i].start, PMD_SIZE);
+ unsigned long end = round_up(mr[i].end, PMD_SIZE);
- printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
-
-#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
- /*
- * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
- * This will simplify cpa(), which otherwise needs to support splitting
- * large pages into small in interrupt context, etc.
- */
- use_pse = use_gbpages = 0;
-#else
- use_pse = cpu_has_pse;
- use_gbpages = direct_gbpages;
+#ifdef CONFIG_X86_32
+ if ((end >> PAGE_SHIFT) > max_low_pfn)
+ continue;
#endif
- /* Enable PSE if available */
- if (cpu_has_pse)
- set_in_cr4(X86_CR4_PSE);
-
- /* Enable PGE if available */
- if (cpu_has_pge) {
- set_in_cr4(X86_CR4_PGE);
- __supported_pte_mask |= _PAGE_GLOBAL;
+ if (memblock_is_region_memory(start, end - start))
+ mr[i].page_size_mask |= 1<<PG_LEVEL_2M;
+ }
+ if ((page_size_mask & (1<<PG_LEVEL_1G)) &&
+ !(mr[i].page_size_mask & (1<<PG_LEVEL_1G))) {
+ unsigned long start = round_down(mr[i].start, PUD_SIZE);
+ unsigned long end = round_up(mr[i].end, PUD_SIZE);
+
+ if (memblock_is_region_memory(start, end - start))
+ mr[i].page_size_mask |= 1<<PG_LEVEL_1G;
+ }
}
+}
- if (use_gbpages)
- page_size_mask |= 1 << PG_LEVEL_1G;
- if (use_pse)
- page_size_mask |= 1 << PG_LEVEL_2M;
+static int __meminit split_mem_range(struct map_range *mr, int nr_range,
+ unsigned long start,
+ unsigned long end)
+{
+ unsigned long start_pfn, end_pfn, limit_pfn;
+ unsigned long pfn;
+ int i;
- memset(mr, 0, sizeof(mr));
- nr_range = 0;
+ limit_pfn = PFN_DOWN(end);
/* head if not big page alignment ? */
- start_pfn = start >> PAGE_SHIFT;
- pos = start_pfn << PAGE_SHIFT;
+ pfn = start_pfn = PFN_DOWN(start);
#ifdef CONFIG_X86_32
/*
* Don't use a large page for the first 2/4MB of memory
@@ -177,68 +221,65 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
* and overlapping MTRRs into large pages can cause
* slowdowns.
*/
- if (pos == 0)
- end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
+ if (pfn == 0)
+ end_pfn = PFN_DOWN(PMD_SIZE);
else
- end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
+ end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#else /* CONFIG_X86_64 */
- end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
+ end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#endif
- if (end_pfn > (end >> PAGE_SHIFT))
- end_pfn = end >> PAGE_SHIFT;
+ if (end_pfn > limit_pfn)
+ end_pfn = limit_pfn;
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
- pos = end_pfn << PAGE_SHIFT;
+ pfn = end_pfn;
}
/* big page (2M) range */
- start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
+ start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#ifdef CONFIG_X86_32
- end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+ end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
#else /* CONFIG_X86_64 */
- end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
- << (PUD_SHIFT - PAGE_SHIFT);
- if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
- end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
+ end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
+ if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE)))
+ end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
#endif
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask & (1<<PG_LEVEL_2M));
- pos = end_pfn << PAGE_SHIFT;
+ pfn = end_pfn;
}
#ifdef CONFIG_X86_64
/* big page (1G) range */
- start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
- << (PUD_SHIFT - PAGE_SHIFT);
- end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
+ start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
+ end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE));
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask &
((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
- pos = end_pfn << PAGE_SHIFT;
+ pfn = end_pfn;
}
/* tail is not big page (1G) alignment */
- start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
- end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+ start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
+ end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask & (1<<PG_LEVEL_2M));
- pos = end_pfn << PAGE_SHIFT;
+ pfn = end_pfn;
}
#endif
/* tail is not big page (2M) alignment */
- start_pfn = pos>>PAGE_SHIFT;
- end_pfn = end>>PAGE_SHIFT;
+ start_pfn = pfn;
+ end_pfn = limit_pfn;
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+ if (!after_bootmem)
+ adjust_range_page_size_mask(mr, nr_range);
+
/* try to merge same page size and continuous */
for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
unsigned long old_start;
@@ -254,62 +295,279 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
}
for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG " %010lx - %010lx page %s\n",
- mr[i].start, mr[i].end,
+ printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n",
+ mr[i].start, mr[i].end - 1,
(mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
(mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
- /*
- * Find space for the kernel direct mapping tables.
- *
- * Later we should allocate these tables in the local node of the
- * memory mapped. Unfortunately this is done currently before the
- * nodes are discovered.
- */
- if (!after_bootmem)
- find_early_table_space(end, use_pse, use_gbpages);
+ return nr_range;
+}
+
+struct range pfn_mapped[E820_X_MAX];
+int nr_pfn_mapped;
+
+static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn)
+{
+ nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_X_MAX,
+ nr_pfn_mapped, start_pfn, end_pfn);
+ nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_X_MAX);
+
+ max_pfn_mapped = max(max_pfn_mapped, end_pfn);
+
+ if (start_pfn < (1UL<<(32-PAGE_SHIFT)))
+ max_low_pfn_mapped = max(max_low_pfn_mapped,
+ min(end_pfn, 1UL<<(32-PAGE_SHIFT)));
+}
+
+bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn)
+{
+ int i;
+
+ for (i = 0; i < nr_pfn_mapped; i++)
+ if ((start_pfn >= pfn_mapped[i].start) &&
+ (end_pfn <= pfn_mapped[i].end))
+ return true;
+
+ return false;
+}
+
+/*
+ * Setup the direct mapping of the physical memory at PAGE_OFFSET.
+ * This runs before bootmem is initialized and gets pages directly from
+ * the physical memory. To access them they are temporarily mapped.
+ */
+unsigned long __init_refok init_memory_mapping(unsigned long start,
+ unsigned long end)
+{
+ struct map_range mr[NR_RANGE_MR];
+ unsigned long ret = 0;
+ int nr_range, i;
+
+ pr_info("init_memory_mapping: [mem %#010lx-%#010lx]\n",
+ start, end - 1);
+
+ memset(mr, 0, sizeof(mr));
+ nr_range = split_mem_range(mr, 0, start, end);
for (i = 0; i < nr_range; i++)
ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
mr[i].page_size_mask);
-#ifdef CONFIG_X86_32
- early_ioremap_page_table_range_init();
+ add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);
- load_cr3(swapper_pg_dir);
-#endif
+ return ret >> PAGE_SHIFT;
+}
-#ifdef CONFIG_X86_64
- if (!after_bootmem && !start) {
- pud_t *pud;
- pmd_t *pmd;
+/*
+ * We need to iterate through the E820 memory map and create direct mappings
+ * for only E820_RAM and E820_KERN_RESERVED regions. We cannot simply
+ * create direct mappings for all pfns from [0 to max_low_pfn) and
+ * [4GB to max_pfn) because of possible memory holes in high addresses
+ * that cannot be marked as UC by fixed/variable range MTRRs.
+ * Depending on the alignment of E820 ranges, this may possibly result
+ * in using smaller size (i.e. 4K instead of 2M or 1G) page tables.
+ *
+ * init_mem_mapping() calls init_range_memory_mapping() with big range.
+ * That range would have hole in the middle or ends, and only ram parts
+ * will be mapped in init_range_memory_mapping().
+ */
+static unsigned long __init init_range_memory_mapping(
+ unsigned long r_start,
+ unsigned long r_end)
+{
+ unsigned long start_pfn, end_pfn;
+ unsigned long mapped_ram_size = 0;
+ int i;
- mmu_cr4_features = read_cr4();
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
+ u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end);
+ u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end);
+ if (start >= end)
+ continue;
/*
- * _brk_end cannot change anymore, but it and _end may be
- * located on different 2M pages. cleanup_highmap(), however,
- * can only consider _end when it runs, so destroy any
- * mappings beyond _brk_end here.
+ * if it is overlapping with brk pgt, we need to
+ * alloc pgt buf from memblock instead.
*/
- pud = pud_offset(pgd_offset_k(_brk_end), _brk_end);
- pmd = pmd_offset(pud, _brk_end - 1);
- while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1))
- pmd_clear(pmd);
+ can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >=
+ min(end, (u64)pgt_buf_top<<PAGE_SHIFT);
+ init_memory_mapping(start, end);
+ mapped_ram_size += end - start;
+ can_use_brk_pgt = true;
}
-#endif
- __flush_tlb_all();
- if (!after_bootmem && e820_table_end > e820_table_start)
- memblock_x86_reserve_range(e820_table_start << PAGE_SHIFT,
- e820_table_end << PAGE_SHIFT, "PGTABLE");
+ return mapped_ram_size;
+}
- if (!after_bootmem)
- early_memtest(start, end);
+static unsigned long __init get_new_step_size(unsigned long step_size)
+{
+ /*
+ * Explain why we shift by 5 and why we don't have to worry about
+ * 'step_size << 5' overflowing:
+ *
+ * initial mapped size is PMD_SIZE (2M).
+ * We can not set step_size to be PUD_SIZE (1G) yet.
+ * In worse case, when we cross the 1G boundary, and
+ * PG_LEVEL_2M is not set, we will need 1+1+512 pages (2M + 8k)
+ * to map 1G range with PTE. Use 5 as shift for now.
+ *
+ * Don't need to worry about overflow, on 32bit, when step_size
+ * is 0, round_down() returns 0 for start, and that turns it
+ * into 0x100000000ULL.
+ */
+ return step_size << 5;
+}
- return ret >> PAGE_SHIFT;
+/**
+ * memory_map_top_down - Map [map_start, map_end) top down
+ * @map_start: start address of the target memory range
+ * @map_end: end address of the target memory range
+ *
+ * This function will setup direct mapping for memory range
+ * [map_start, map_end) in top-down. That said, the page tables
+ * will be allocated at the end of the memory, and we map the
+ * memory in top-down.
+ */
+static void __init memory_map_top_down(unsigned long map_start,
+ unsigned long map_end)
+{
+ unsigned long real_end, start, last_start;
+ unsigned long step_size;
+ unsigned long addr;
+ unsigned long mapped_ram_size = 0;
+ unsigned long new_mapped_ram_size;
+
+ /* xen has big range in reserved near end of ram, skip it at first.*/
+ addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE);
+ real_end = addr + PMD_SIZE;
+
+ /* step_size need to be small so pgt_buf from BRK could cover it */
+ step_size = PMD_SIZE;
+ max_pfn_mapped = 0; /* will get exact value next */
+ min_pfn_mapped = real_end >> PAGE_SHIFT;
+ last_start = start = real_end;
+
+ /*
+ * We start from the top (end of memory) and go to the bottom.
+ * The memblock_find_in_range() gets us a block of RAM from the
+ * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages
+ * for page table.
+ */
+ while (last_start > map_start) {
+ if (last_start > step_size) {
+ start = round_down(last_start - 1, step_size);
+ if (start < map_start)
+ start = map_start;
+ } else
+ start = map_start;
+ new_mapped_ram_size = init_range_memory_mapping(start,
+ last_start);
+ last_start = start;
+ min_pfn_mapped = last_start >> PAGE_SHIFT;
+ /* only increase step_size after big range get mapped */
+ if (new_mapped_ram_size > mapped_ram_size)
+ step_size = get_new_step_size(step_size);
+ mapped_ram_size += new_mapped_ram_size;
+ }
+
+ if (real_end < map_end)
+ init_range_memory_mapping(real_end, map_end);
}
+/**
+ * memory_map_bottom_up - Map [map_start, map_end) bottom up
+ * @map_start: start address of the target memory range
+ * @map_end: end address of the target memory range
+ *
+ * This function will setup direct mapping for memory range
+ * [map_start, map_end) in bottom-up. Since we have limited the
+ * bottom-up allocation above the kernel, the page tables will
+ * be allocated just above the kernel and we map the memory
+ * in [map_start, map_end) in bottom-up.
+ */
+static void __init memory_map_bottom_up(unsigned long map_start,
+ unsigned long map_end)
+{
+ unsigned long next, new_mapped_ram_size, start;
+ unsigned long mapped_ram_size = 0;
+ /* step_size need to be small so pgt_buf from BRK could cover it */
+ unsigned long step_size = PMD_SIZE;
+
+ start = map_start;
+ min_pfn_mapped = start >> PAGE_SHIFT;
+
+ /*
+ * We start from the bottom (@map_start) and go to the top (@map_end).
+ * The memblock_find_in_range() gets us a block of RAM from the
+ * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages
+ * for page table.
+ */
+ while (start < map_end) {
+ if (map_end - start > step_size) {
+ next = round_up(start + 1, step_size);
+ if (next > map_end)
+ next = map_end;
+ } else
+ next = map_end;
+
+ new_mapped_ram_size = init_range_memory_mapping(start, next);
+ start = next;
+
+ if (new_mapped_ram_size > mapped_ram_size)
+ step_size = get_new_step_size(step_size);
+ mapped_ram_size += new_mapped_ram_size;
+ }
+}
+
+void __init init_mem_mapping(void)
+{
+ unsigned long end;
+
+ probe_page_size_mask();
+
+#ifdef CONFIG_X86_64
+ end = max_pfn << PAGE_SHIFT;
+#else
+ end = max_low_pfn << PAGE_SHIFT;
+#endif
+
+ /* the ISA range is always mapped regardless of memory holes */
+ init_memory_mapping(0, ISA_END_ADDRESS);
+
+ /*
+ * If the allocation is in bottom-up direction, we setup direct mapping
+ * in bottom-up, otherwise we setup direct mapping in top-down.
+ */
+ if (memblock_bottom_up()) {
+ unsigned long kernel_end = __pa_symbol(_end);
+
+ /*
+ * we need two separate calls here. This is because we want to
+ * allocate page tables above the kernel. So we first map
+ * [kernel_end, end) to make memory above the kernel be mapped
+ * as soon as possible. And then use page tables allocated above
+ * the kernel to map [ISA_END_ADDRESS, kernel_end).
+ */
+ memory_map_bottom_up(kernel_end, end);
+ memory_map_bottom_up(ISA_END_ADDRESS, kernel_end);
+ } else {
+ memory_map_top_down(ISA_END_ADDRESS, end);
+ }
+
+#ifdef CONFIG_X86_64
+ if (max_pfn > max_low_pfn) {
+ /* can we preseve max_low_pfn ?*/
+ max_low_pfn = max_pfn;
+ }
+#else
+ early_ioremap_page_table_range_init();
+#endif
+
+ load_cr3(swapper_pg_dir);
+ __flush_tlb_all();
+
+ early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
+}
/*
* devmem_is_allowed() checks to see if /dev/mem access to a certain address
@@ -323,7 +581,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
*/
int devmem_is_allowed(unsigned long pagenr)
{
- if (pagenr <= 256)
+ if (pagenr < 256)
return 1;
if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
return 0;
@@ -334,7 +592,6 @@ int devmem_is_allowed(unsigned long pagenr)
void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
- unsigned long addr;
unsigned long begin_aligned, end_aligned;
/* Make sure boundaries are page aligned */
@@ -349,47 +606,47 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
if (begin >= end)
return;
- addr = begin;
-
/*
* If debugging page accesses then do not free this memory but
* mark them not present - any buggy init-section access will
* create a kernel page fault:
*/
#ifdef CONFIG_DEBUG_PAGEALLOC
- printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
- begin, end);
+ printk(KERN_INFO "debug: unmapping init [mem %#010lx-%#010lx]\n",
+ begin, end - 1);
set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
#else
/*
* We just marked the kernel text read only above, now that
* we are going to free part of that, we need to make that
- * writeable first.
+ * writeable and non-executable first.
*/
+ set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
- printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
-
- for (; addr < end; addr += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(addr));
- init_page_count(virt_to_page(addr));
- memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
- free_page(addr);
- totalram_pages++;
- }
+ free_reserved_area((void *)begin, (void *)end, POISON_FREE_INITMEM, what);
#endif
}
void free_initmem(void)
{
- free_init_pages("unused kernel memory",
+ free_init_pages("unused kernel",
(unsigned long)(&__init_begin),
(unsigned long)(&__init_end));
}
#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
+void __init free_initrd_mem(unsigned long start, unsigned long end)
{
+#ifdef CONFIG_MICROCODE_EARLY
+ /*
+ * Remember, initrd memory may contain microcode or other useful things.
+ * Before we lose initrd mem, we need to find a place to hold them
+ * now that normal virtual memory is enabled.
+ */
+ save_microcode_in_initrd();
+#endif
+
/*
* end could be not aligned, and We can not align that,
* decompresser could be confused by aligned initrd_end
@@ -399,6 +656,27 @@ void free_initrd_mem(unsigned long start, unsigned long end)
* - relocate_initrd()
* So here We can do PAGE_ALIGN() safely to get partial page to be freed
*/
- free_init_pages("initrd memory", start, PAGE_ALIGN(end));
+ free_init_pages("initrd", start, PAGE_ALIGN(end));
}
#endif
+
+void __init zone_sizes_init(void)
+{
+ unsigned long max_zone_pfns[MAX_NR_ZONES];
+
+ memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+
+#ifdef CONFIG_ZONE_DMA
+ max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
+#endif
+#ifdef CONFIG_ZONE_DMA32
+ max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
+#endif
+ max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
+#ifdef CONFIG_HIGHMEM
+ max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
+#endif
+
+ free_area_init_nodes(max_zone_pfns);
+}
+
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 0e969f9f401..e39504878ae 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -35,7 +35,6 @@
#include <asm/asm.h>
#include <asm/bios_ebda.h>
#include <asm/processor.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/dma.h>
@@ -45,6 +44,7 @@
#include <asm/bugs.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
+#include <asm/olpc_ofw.h>
#include <asm/pgalloc.h>
#include <asm/sections.h>
#include <asm/paravirt.h>
@@ -53,25 +53,14 @@
#include <asm/page_types.h>
#include <asm/init.h>
+#include "mm_internal.h"
+
unsigned long highstart_pfn, highend_pfn;
static noinline int do_test_wp_bit(void);
bool __read_mostly __vmalloc_start_set = false;
-static __init void *alloc_low_page(void)
-{
- unsigned long pfn = e820_table_end++;
- void *adr;
-
- if (pfn >= e820_table_top)
- panic("alloc_low_page: ran out of memory");
-
- adr = __va(pfn * PAGE_SIZE);
- clear_page(adr);
- return adr;
-}
-
/*
* Creates a middle page table and puts a pointer to it in the
* given global directory entry. This only returns the gd entry
@@ -84,10 +73,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
#ifdef CONFIG_X86_PAE
if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
- if (after_bootmem)
- pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE);
- else
- pmd_table = (pmd_t *)alloc_low_page();
+ pmd_table = (pmd_t *)alloc_low_page();
paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
pud = pud_offset(pgd, 0);
@@ -109,17 +95,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
static pte_t * __init one_page_table_init(pmd_t *pmd)
{
if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
- pte_t *page_table = NULL;
-
- if (after_bootmem) {
-#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
- page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
-#endif
- if (!page_table)
- page_table =
- (pte_t *)alloc_bootmem_pages(PAGE_SIZE);
- } else
- page_table = (pte_t *)alloc_low_page();
+ pte_t *page_table = (pte_t *)alloc_low_page();
paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
@@ -146,8 +122,39 @@ pte_t * __init populate_extra_pte(unsigned long vaddr)
return one_page_table_init(pmd) + pte_idx;
}
+static unsigned long __init
+page_table_range_init_count(unsigned long start, unsigned long end)
+{
+ unsigned long count = 0;
+#ifdef CONFIG_HIGHMEM
+ int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
+ int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
+ int pgd_idx, pmd_idx;
+ unsigned long vaddr;
+
+ if (pmd_idx_kmap_begin == pmd_idx_kmap_end)
+ return 0;
+
+ vaddr = start;
+ pgd_idx = pgd_index(vaddr);
+
+ for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) {
+ for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
+ pmd_idx++) {
+ if ((vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin &&
+ (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end)
+ count++;
+ vaddr += PMD_SIZE;
+ }
+ pmd_idx = 0;
+ }
+#endif
+ return count;
+}
+
static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
- unsigned long vaddr, pte_t *lastpte)
+ unsigned long vaddr, pte_t *lastpte,
+ void **adr)
{
#ifdef CONFIG_HIGHMEM
/*
@@ -161,16 +168,15 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
if (pmd_idx_kmap_begin != pmd_idx_kmap_end
&& (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
- && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
- && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start
- || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) {
+ && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) {
pte_t *newpte;
int i;
BUG_ON(after_bootmem);
- newpte = alloc_low_page();
+ newpte = *adr;
for (i = 0; i < PTRS_PER_PTE; i++)
set_pte(newpte + i, pte[i]);
+ *adr = (void *)(((unsigned long)(*adr)) + PAGE_SIZE);
paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT);
set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE));
@@ -204,6 +210,11 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
pgd_t *pgd;
pmd_t *pmd;
pte_t *pte = NULL;
+ unsigned long count = page_table_range_init_count(start, end);
+ void *adr = NULL;
+
+ if (count)
+ adr = alloc_low_pages(count);
vaddr = start;
pgd_idx = pgd_index(vaddr);
@@ -216,7 +227,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
pmd++, pmd_idx++) {
pte = page_table_kmap_check(one_page_table_init(pmd),
- pmd, vaddr, pte);
+ pmd, vaddr, pte, &adr);
vaddr += PMD_SIZE;
}
@@ -226,7 +237,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
static inline int is_kernel_text(unsigned long addr)
{
- if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
+ if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end)
return 1;
return 0;
}
@@ -310,6 +321,7 @@ repeat:
__pgprot(PTE_IDENT_ATTR |
_PAGE_PSE);
+ pfn &= PMD_MASK >> PAGE_SHIFT;
addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
PAGE_OFFSET + PAGE_SIZE-1;
@@ -415,34 +427,20 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
pkmap_page_table = pte;
}
-static void __init add_one_highpage_init(struct page *page)
-{
- ClearPageReserved(page);
- init_page_count(page);
- __free_page(page);
- totalhigh_pages++;
-}
-
void __init add_highpages_with_active_regions(int nid,
unsigned long start_pfn, unsigned long end_pfn)
{
- struct range *range;
- int nr_range;
- int i;
-
- nr_range = __get_free_all_memory_range(&range, nid, start_pfn, end_pfn);
-
- for (i = 0; i < nr_range; i++) {
- struct page *page;
- int node_pfn;
-
- for (node_pfn = range[i].start; node_pfn < range[i].end;
- node_pfn++) {
- if (!pfn_valid(node_pfn))
- continue;
- page = pfn_to_page(node_pfn);
- add_one_highpage_init(page);
- }
+ phys_addr_t start, end;
+ u64 i;
+
+ for_each_free_mem_range(i, nid, &start, &end, NULL) {
+ unsigned long pfn = clamp_t(unsigned long, PFN_UP(start),
+ start_pfn, end_pfn);
+ unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end),
+ start_pfn, end_pfn);
+ for ( ; pfn < e_pfn; pfn++)
+ if (pfn_valid(pfn))
+ free_highmem_page(pfn_to_page(pfn));
}
}
#else
@@ -451,19 +449,24 @@ static inline void permanent_kmaps_init(pgd_t *pgd_base)
}
#endif /* CONFIG_HIGHMEM */
-void __init native_pagetable_setup_start(pgd_t *base)
+void __init native_pagetable_init(void)
{
unsigned long pfn, va;
- pgd_t *pgd;
+ pgd_t *pgd, *base = swapper_pg_dir;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
/*
* Remove any mappings which extend past the end of physical
- * memory from the boot time page table:
+ * memory from the boot time page table.
+ * In virtual address space, we should have at least two pages
+ * from VMALLOC_END to pkmap or fixmap according to VMALLOC_END
+ * definition. And max_low_pfn is set to VMALLOC_END physical
+ * address. If initial memory mapping is doing right job, we
+ * should have pte used near max_low_pfn or one pmd is not present.
*/
- for (pfn = max_low_pfn + 1; pfn < 1<<(32-PAGE_SHIFT); pfn++) {
+ for (pfn = max_low_pfn; pfn < 1<<(32-PAGE_SHIFT); pfn++) {
va = PAGE_OFFSET + (pfn<<PAGE_SHIFT);
pgd = base + pgd_index(va);
if (!pgd_present(*pgd))
@@ -474,17 +477,23 @@ void __init native_pagetable_setup_start(pgd_t *base)
if (!pmd_present(*pmd))
break;
+ /* should not be large page here */
+ if (pmd_large(*pmd)) {
+ pr_warn("try to clear pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx, but pmd is big page and is not using pte !\n",
+ pfn, pmd, __pa(pmd));
+ BUG_ON(1);
+ }
+
pte = pte_offset_kernel(pmd, va);
if (!pte_present(*pte))
break;
+ printk(KERN_DEBUG "clearing pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx pte: %p pte phys: %lx\n",
+ pfn, pmd, __pa(pmd), pte, __pa(pte));
pte_clear(NULL, va, pte);
}
paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT);
-}
-
-void __init native_pagetable_setup_done(pgd_t *base)
-{
+ paging_init();
}
/*
@@ -499,7 +508,7 @@ void __init native_pagetable_setup_done(pgd_t *base)
* If we're booting paravirtualized under a hypervisor, then there are
* more options: we may already be running PAE, and the pagetable may
* or may not be based in swapper_pg_dir. In any case,
- * paravirt_pagetable_setup_start() will set up swapper_pg_dir
+ * paravirt_pagetable_init() will set up swapper_pg_dir
* appropriately for the rest of the initialization to work.
*
* In general, pagetable_init() assumes that the pagetable may already
@@ -559,7 +568,7 @@ early_param("highmem", parse_highmem);
* artificially via the highmem=x boot parameter then create
* it:
*/
-void __init lowmem_pfn_init(void)
+static void __init lowmem_pfn_init(void)
{
/* max_low_pfn is 0, we already have early_res support */
max_low_pfn = max_pfn;
@@ -595,7 +604,7 @@ void __init lowmem_pfn_init(void)
* We have more RAM than fits into lowmem - we try to put it into
* highmem, also taking the highmem=x boot parameter into account:
*/
-void __init highmem_pfn_init(void)
+static void __init highmem_pfn_init(void)
{
max_low_pfn = MAXMEM_PFN;
@@ -643,27 +652,24 @@ void __init find_low_pfn_range(void)
}
#ifndef CONFIG_NEED_MULTIPLE_NODES
-void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
- int acpi, int k8)
+void __init initmem_init(void)
{
#ifdef CONFIG_HIGHMEM
highstart_pfn = highend_pfn = max_pfn;
if (max_pfn > max_low_pfn)
highstart_pfn = max_low_pfn;
- memblock_x86_register_active_regions(0, 0, highend_pfn);
- sparse_memory_present_with_active_regions(0);
printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
pages_to_mb(highend_pfn - highstart_pfn));
- num_physpages = highend_pfn;
high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
#else
- memblock_x86_register_active_regions(0, 0, max_low_pfn);
- sparse_memory_present_with_active_regions(0);
- num_physpages = max_low_pfn;
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
#endif
+
+ memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
+ sparse_memory_present_with_active_regions(0);
+
#ifdef CONFIG_FLATMEM
- max_mapnr = num_physpages;
+ max_mapnr = IS_ENABLED(CONFIG_HIGHMEM) ? highend_pfn : max_low_pfn;
#endif
__vmalloc_start_set = true;
@@ -674,27 +680,11 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
}
#endif /* !CONFIG_NEED_MULTIPLE_NODES */
-static void __init zone_sizes_init(void)
-{
- unsigned long max_zone_pfns[MAX_NR_ZONES];
- memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
- max_zone_pfns[ZONE_DMA] =
- virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
- max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
-#ifdef CONFIG_HIGHMEM
- max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
-#endif
-
- free_area_init_nodes(max_zone_pfns);
-}
-
void __init setup_bootmem_allocator(void)
{
printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
max_pfn_mapped<<PAGE_SHIFT);
printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
-
- after_bootmem = 1;
}
/*
@@ -715,6 +705,8 @@ void __init paging_init(void)
/*
* NOTE: at this point the bootmem allocator is fully available.
*/
+ olpc_dt_build_devicetree();
+ sparse_memory_present_with_active_regions(MAX_NUMNODES);
sparse_init();
zone_sizes_init();
}
@@ -731,16 +723,13 @@ static void __init test_wp_bit(void)
"Checking if this processor honours the WP bit even in supervisor mode...");
/* Any page-aligned address will do, the test is non-destructive */
- __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
+ __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_KERNEL_RO);
boot_cpu_data.wp_works_ok = do_test_wp_bit();
clear_fixmap(FIX_WP_TEST);
if (!boot_cpu_data.wp_works_ok) {
printk(KERN_CONT "No.\n");
-#ifdef CONFIG_X86_WP_WORKS_OK
- panic(
- "This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
-#endif
+ panic("Linux doesn't support CPUs with broken WP.");
} else {
printk(KERN_CONT "Ok.\n");
}
@@ -748,41 +737,28 @@ static void __init test_wp_bit(void)
void __init mem_init(void)
{
- int codesize, reservedpages, datasize, initsize;
- int tmp;
-
pci_iommu_alloc();
#ifdef CONFIG_FLATMEM
BUG_ON(!mem_map);
#endif
- /* this will put all low memory onto the freelists */
- totalram_pages += free_all_bootmem();
-
- reservedpages = 0;
- for (tmp = 0; tmp < max_low_pfn; tmp++)
- /*
- * Only count reserved RAM pages:
- */
- if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
- reservedpages++;
-
+ /*
+ * With CONFIG_DEBUG_PAGEALLOC initialization of highmem pages has to
+ * be done before free_all_bootmem(). Memblock use free low memory for
+ * temporary data (see find_range_array()) and for this purpose can use
+ * pages that was already passed to the buddy allocator, hence marked as
+ * not accessible in the page tables when compiled with
+ * CONFIG_DEBUG_PAGEALLOC. Otherwise order of initialization is not
+ * important here.
+ */
set_highmem_pages_init();
- codesize = (unsigned long) &_etext - (unsigned long) &_text;
- datasize = (unsigned long) &_edata - (unsigned long) &_etext;
- initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
+ /* this will put all low memory onto the freelists */
+ free_all_bootmem();
- printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
- "%dk reserved, %dk data, %dk init, %ldk highmem)\n",
- nr_free_pages() << (PAGE_SHIFT-10),
- num_physpages << (PAGE_SHIFT-10),
- codesize >> 10,
- reservedpages << (PAGE_SHIFT-10),
- datasize >> 10,
- initsize >> 10,
- totalhigh_pages << (PAGE_SHIFT-10));
+ after_bootmem = 1;
+ mem_init_print_info(NULL);
printk(KERN_INFO "virtual kernel memory layout:\n"
" fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
#ifdef CONFIG_HIGHMEM
@@ -830,6 +806,9 @@ void __init mem_init(void)
BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END);
#undef high_memory
#undef __FIXADDR_TOP
+#ifdef CONFIG_RANDOMIZE_BASE
+ BUILD_BUG_ON(CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE);
+#endif
#ifdef CONFIG_HIGHMEM
BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
@@ -852,6 +831,18 @@ int arch_add_memory(int nid, u64 start, u64 size)
return __add_pages(nid, zone, start_pfn, nr_pages);
}
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+ struct zone *zone;
+
+ zone = page_zone(pfn_to_page(start_pfn));
+ return __remove_pages(zone, start_pfn, nr_pages);
+}
+#endif
#endif
/*
@@ -912,6 +903,23 @@ void set_kernel_text_ro(void)
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
}
+static void mark_nxdata_nx(void)
+{
+ /*
+ * When this called, init has already been executed and released,
+ * so everything past _etext should be NX.
+ */
+ unsigned long start = PFN_ALIGN(_etext);
+ /*
+ * This comes from is_kernel_text upper limit. Also HPAGE where used:
+ */
+ unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start;
+
+ if (__supported_pte_mask & _PAGE_NX)
+ printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10);
+ set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT);
+}
+
void mark_rodata_ro(void)
{
unsigned long start = PFN_ALIGN(_text);
@@ -946,6 +954,7 @@ void mark_rodata_ro(void)
printk(KERN_INFO "Testing CPA: write protecting again\n");
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
#endif
+ mark_nxdata_nx();
}
#endif
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 71a59296af8..df1a9927ad2 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -28,13 +28,14 @@
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
+#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>
#include <linux/gfp.h>
+#include <linux/kcore.h>
#include <asm/processor.h>
#include <asm/bios_ebda.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -51,6 +52,84 @@
#include <asm/numa.h>
#include <asm/cacheflush.h>
#include <asm/init.h>
+#include <asm/uv/uv.h>
+#include <asm/setup.h>
+
+#include "mm_internal.h"
+
+static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page,
+ unsigned long addr, unsigned long end)
+{
+ addr &= PMD_MASK;
+ for (; addr < end; addr += PMD_SIZE) {
+ pmd_t *pmd = pmd_page + pmd_index(addr);
+
+ if (!pmd_present(*pmd))
+ set_pmd(pmd, __pmd(addr | pmd_flag));
+ }
+}
+static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
+ unsigned long addr, unsigned long end)
+{
+ unsigned long next;
+
+ for (; addr < end; addr = next) {
+ pud_t *pud = pud_page + pud_index(addr);
+ pmd_t *pmd;
+
+ next = (addr & PUD_MASK) + PUD_SIZE;
+ if (next > end)
+ next = end;
+
+ if (pud_present(*pud)) {
+ pmd = pmd_offset(pud, 0);
+ ident_pmd_init(info->pmd_flag, pmd, addr, next);
+ continue;
+ }
+ pmd = (pmd_t *)info->alloc_pgt_page(info->context);
+ if (!pmd)
+ return -ENOMEM;
+ ident_pmd_init(info->pmd_flag, pmd, addr, next);
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+ }
+
+ return 0;
+}
+
+int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
+ unsigned long addr, unsigned long end)
+{
+ unsigned long next;
+ int result;
+ int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0;
+
+ for (; addr < end; addr = next) {
+ pgd_t *pgd = pgd_page + pgd_index(addr) + off;
+ pud_t *pud;
+
+ next = (addr & PGDIR_MASK) + PGDIR_SIZE;
+ if (next > end)
+ next = end;
+
+ if (pgd_present(*pgd)) {
+ pud = pud_offset(pgd, 0);
+ result = ident_pud_init(info, pud, addr, next);
+ if (result)
+ return result;
+ continue;
+ }
+
+ pud = (pud_t *)info->alloc_pgt_page(info->context);
+ if (!pud)
+ return -ENOMEM;
+ result = ident_pud_init(info, pud, addr, next);
+ if (result)
+ return result;
+ set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+ }
+
+ return 0;
+}
static int __init parse_direct_gbpages_off(char *arg)
{
@@ -105,18 +184,18 @@ void sync_global_pgds(unsigned long start, unsigned long end)
for (address = start; address <= end; address += PGDIR_SIZE) {
const pgd_t *pgd_ref = pgd_offset_k(address);
- unsigned long flags;
struct page *page;
if (pgd_none(*pgd_ref))
continue;
- spin_lock_irqsave(&pgd_lock, flags);
+ spin_lock(&pgd_lock);
list_for_each_entry(page, &pgd_list, lru) {
pgd_t *pgd;
spinlock_t *pgt_lock;
pgd = (pgd_t *)page_address(page) + pgd_index(address);
+ /* the pgt_lock only for Xen */
pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
spin_lock(pgt_lock);
@@ -128,7 +207,7 @@ void sync_global_pgds(unsigned long start, unsigned long end)
spin_unlock(pgt_lock);
}
- spin_unlock_irqrestore(&pgd_lock, flags);
+ spin_unlock(&pgd_lock);
}
}
@@ -289,22 +368,30 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
*
* from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
*
- * phys_addr holds the negative offset to the kernel, which is added
+ * phys_base holds the negative offset to the kernel, which is added
* to the compile time generated pmds. This results in invalid pmds up
* to the point where we hit the physaddr 0 mapping.
*
- * We limit the mappings to the region from _text to _end. _end is
- * rounded up to the 2MB boundary. This catches the invalid pmds as
+ * We limit the mappings to the region from _text to _brk_end. _brk_end
+ * is rounded up to the 2MB boundary. This catches the invalid pmds as
* well, as they are located before _text:
*/
void __init cleanup_highmap(void)
{
unsigned long vaddr = __START_KERNEL_map;
- unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1;
+ unsigned long vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE;
+ unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
pmd_t *pmd = level2_kernel_pgt;
- pmd_t *last_pmd = pmd + PTRS_PER_PMD;
- for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
+ /*
+ * Native path, max_pfn_mapped is not set yet.
+ * Xen has valid max_pfn_mapped set in
+ * arch/x86/xen/mmu.c:xen_setup_kernel_pagetable().
+ */
+ if (max_pfn_mapped)
+ vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
+
+ for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
if (pmd_none(*pmd))
continue;
if (vaddr < (unsigned long) _text || vaddr > end)
@@ -312,53 +399,24 @@ void __init cleanup_highmap(void)
}
}
-static __ref void *alloc_low_page(unsigned long *phys)
-{
- unsigned long pfn = e820_table_end++;
- void *adr;
-
- if (after_bootmem) {
- adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
- *phys = __pa(adr);
-
- return adr;
- }
-
- if (pfn >= e820_table_top)
- panic("alloc_low_page: ran out of memory");
-
- adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
- clear_page(adr);
- *phys = pfn * PAGE_SIZE;
- return adr;
-}
-
-static __ref void unmap_low_page(void *adr)
-{
- if (after_bootmem)
- return;
-
- early_iounmap(adr, PAGE_SIZE);
-}
-
static unsigned long __meminit
phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
pgprot_t prot)
{
- unsigned pages = 0;
+ unsigned long pages = 0, next;
unsigned long last_map_addr = end;
int i;
pte_t *pte = pte_page + pte_index(addr);
- for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
-
+ for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr = next, pte++) {
+ next = (addr & PAGE_MASK) + PAGE_SIZE;
if (addr >= end) {
- if (!after_bootmem) {
- for(; i < PTRS_PER_PTE; i++, pte++)
- set_pte(pte, __pte(0));
- }
- break;
+ if (!after_bootmem &&
+ !e820_any_mapped(addr & PAGE_MASK, next, E820_RAM) &&
+ !e820_any_mapped(addr & PAGE_MASK, next, E820_RESERVED_KERN))
+ set_pte(pte, __pte(0));
+ continue;
}
/*
@@ -368,7 +426,8 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
* these mappings are more intelligent.
*/
if (pte_val(*pte)) {
- pages++;
+ if (!after_bootmem)
+ pages++;
continue;
}
@@ -386,41 +445,33 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
}
static unsigned long __meminit
-phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
- pgprot_t prot)
-{
- pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
-
- return phys_pte_init(pte, address, end, prot);
-}
-
-static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
unsigned long page_size_mask, pgprot_t prot)
{
- unsigned long pages = 0;
+ unsigned long pages = 0, next;
unsigned long last_map_addr = end;
int i = pmd_index(address);
- for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
- unsigned long pte_phys;
+ for (; i < PTRS_PER_PMD; i++, address = next) {
pmd_t *pmd = pmd_page + pmd_index(address);
pte_t *pte;
pgprot_t new_prot = prot;
+ next = (address & PMD_MASK) + PMD_SIZE;
if (address >= end) {
- if (!after_bootmem) {
- for (; i < PTRS_PER_PMD; i++, pmd++)
- set_pmd(pmd, __pmd(0));
- }
- break;
+ if (!after_bootmem &&
+ !e820_any_mapped(address & PMD_MASK, next, E820_RAM) &&
+ !e820_any_mapped(address & PMD_MASK, next, E820_RESERVED_KERN))
+ set_pmd(pmd, __pmd(0));
+ continue;
}
if (pmd_val(*pmd)) {
if (!pmd_large(*pmd)) {
spin_lock(&init_mm.page_table_lock);
- last_map_addr = phys_pte_update(pmd, address,
+ pte = (pte_t *)pmd_page_vaddr(*pmd);
+ last_map_addr = phys_pte_init(pte, address,
end, prot);
spin_unlock(&init_mm.page_table_lock);
continue;
@@ -438,7 +489,9 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
* attributes.
*/
if (page_size_mask & (1 << PG_LEVEL_2M)) {
- pages++;
+ if (!after_bootmem)
+ pages++;
+ last_map_addr = next;
continue;
}
new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
@@ -448,19 +501,18 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
pages++;
spin_lock(&init_mm.page_table_lock);
set_pte((pte_t *)pmd,
- pfn_pte(address >> PAGE_SHIFT,
+ pfn_pte((address & PMD_MASK) >> PAGE_SHIFT,
__pgprot(pgprot_val(prot) | _PAGE_PSE)));
spin_unlock(&init_mm.page_table_lock);
- last_map_addr = (address & PMD_MASK) + PMD_SIZE;
+ last_map_addr = next;
continue;
}
- pte = alloc_low_page(&pte_phys);
+ pte = alloc_low_page();
last_map_addr = phys_pte_init(pte, address, end, new_prot);
- unmap_low_page(pte);
spin_lock(&init_mm.page_table_lock);
- pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
+ pmd_populate_kernel(&init_mm, pmd, pte);
spin_unlock(&init_mm.page_table_lock);
}
update_page_count(PG_LEVEL_2M, pages);
@@ -468,44 +520,33 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
}
static unsigned long __meminit
-phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
- unsigned long page_size_mask, pgprot_t prot)
-{
- pmd_t *pmd = pmd_offset(pud, 0);
- unsigned long last_map_addr;
-
- last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
- __flush_tlb_all();
- return last_map_addr;
-}
-
-static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
unsigned long page_size_mask)
{
- unsigned long pages = 0;
+ unsigned long pages = 0, next;
unsigned long last_map_addr = end;
int i = pud_index(addr);
- for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
- unsigned long pmd_phys;
+ for (; i < PTRS_PER_PUD; i++, addr = next) {
pud_t *pud = pud_page + pud_index(addr);
pmd_t *pmd;
pgprot_t prot = PAGE_KERNEL;
- if (addr >= end)
- break;
-
- if (!after_bootmem &&
- !e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
- set_pud(pud, __pud(0));
+ next = (addr & PUD_MASK) + PUD_SIZE;
+ if (addr >= end) {
+ if (!after_bootmem &&
+ !e820_any_mapped(addr & PUD_MASK, next, E820_RAM) &&
+ !e820_any_mapped(addr & PUD_MASK, next, E820_RESERVED_KERN))
+ set_pud(pud, __pud(0));
continue;
}
if (pud_val(*pud)) {
if (!pud_large(*pud)) {
- last_map_addr = phys_pmd_update(pud, addr, end,
+ pmd = pmd_offset(pud, 0);
+ last_map_addr = phys_pmd_init(pmd, addr, end,
page_size_mask, prot);
+ __flush_tlb_all();
continue;
}
/*
@@ -521,7 +562,9 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
* attributes.
*/
if (page_size_mask & (1 << PG_LEVEL_1G)) {
- pages++;
+ if (!after_bootmem)
+ pages++;
+ last_map_addr = next;
continue;
}
prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
@@ -531,19 +574,19 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
pages++;
spin_lock(&init_mm.page_table_lock);
set_pte((pte_t *)pud,
- pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+ pfn_pte((addr & PUD_MASK) >> PAGE_SHIFT,
+ PAGE_KERNEL_LARGE));
spin_unlock(&init_mm.page_table_lock);
- last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
+ last_map_addr = next;
continue;
}
- pmd = alloc_low_page(&pmd_phys);
+ pmd = alloc_low_page();
last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
prot);
- unmap_low_page(pmd);
spin_lock(&init_mm.page_table_lock);
- pud_populate(&init_mm, pud, __va(pmd_phys));
+ pud_populate(&init_mm, pud, pmd);
spin_unlock(&init_mm.page_table_lock);
}
__flush_tlb_all();
@@ -553,17 +596,6 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
return last_map_addr;
}
-static unsigned long __meminit
-phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
- unsigned long page_size_mask)
-{
- pud_t *pud;
-
- pud = (pud_t *)pgd_page_vaddr(*pgd);
-
- return phys_pud_init(pud, addr, end, page_size_mask);
-}
-
unsigned long __meminit
kernel_physical_mapping_init(unsigned long start,
unsigned long end,
@@ -579,32 +611,29 @@ kernel_physical_mapping_init(unsigned long start,
for (; start < end; start = next) {
pgd_t *pgd = pgd_offset_k(start);
- unsigned long pud_phys;
pud_t *pud;
- next = (start + PGDIR_SIZE) & PGDIR_MASK;
- if (next > end)
- next = end;
+ next = (start & PGDIR_MASK) + PGDIR_SIZE;
if (pgd_val(*pgd)) {
- last_map_addr = phys_pud_update(pgd, __pa(start),
+ pud = (pud_t *)pgd_page_vaddr(*pgd);
+ last_map_addr = phys_pud_init(pud, __pa(start),
__pa(end), page_size_mask);
continue;
}
- pud = alloc_low_page(&pud_phys);
- last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
+ pud = alloc_low_page();
+ last_map_addr = phys_pud_init(pud, __pa(start), __pa(end),
page_size_mask);
- unmap_low_page(pud);
spin_lock(&init_mm.page_table_lock);
- pgd_populate(&init_mm, pgd, __va(pud_phys));
+ pgd_populate(&init_mm, pgd, pud);
spin_unlock(&init_mm.page_table_lock);
pgd_changed = true;
}
if (pgd_changed)
- sync_global_pgds(addr, end);
+ sync_global_pgds(addr, end - 1);
__flush_tlb_all();
@@ -612,22 +641,14 @@ kernel_physical_mapping_init(unsigned long start,
}
#ifndef CONFIG_NUMA
-void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
- int acpi, int k8)
+void __init initmem_init(void)
{
- memblock_x86_register_active_regions(0, start_pfn, end_pfn);
+ memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
}
#endif
void __init paging_init(void)
{
- unsigned long max_zone_pfns[MAX_NR_ZONES];
-
- memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
- max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
- max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
- max_zone_pfns[ZONE_NORMAL] = max_pfn;
-
sparse_memory_present_with_active_regions(MAX_NUMNODES);
sparse_init();
@@ -637,9 +658,11 @@ void __init paging_init(void)
* numa support is not compiled in, and later node_set_state
* will not set it back.
*/
- node_clear_state(0, N_NORMAL_MEMORY);
+ node_clear_state(0, N_MEMORY);
+ if (N_MEMORY != N_NORMAL_MEMORY)
+ node_clear_state(0, N_NORMAL_MEMORY);
- free_area_init_nodes(max_zone_pfns);
+ zone_sizes_init();
}
/*
@@ -669,13 +692,11 @@ int arch_add_memory(int nid, u64 start, u64 size)
{
struct pglist_data *pgdat = NODE_DATA(nid);
struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
- unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
+ unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
- last_mapped_pfn = init_memory_mapping(start, start + size);
- if (last_mapped_pfn > max_pfn_mapped)
- max_pfn_mapped = last_mapped_pfn;
+ init_memory_mapping(start, start + size);
ret = __add_pages(nid, zone, start_pfn, nr_pages);
WARN_ON_ONCE(ret);
@@ -687,57 +708,357 @@ int arch_add_memory(int nid, u64 start, u64 size)
}
EXPORT_SYMBOL_GPL(arch_add_memory);
-#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
-int memory_add_physaddr_to_nid(u64 start)
+#define PAGE_INUSE 0xFD
+
+static void __meminit free_pagetable(struct page *page, int order)
{
- return 0;
+ unsigned long magic;
+ unsigned int nr_pages = 1 << order;
+
+ /* bootmem page has reserved flag */
+ if (PageReserved(page)) {
+ __ClearPageReserved(page);
+
+ magic = (unsigned long)page->lru.next;
+ if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
+ while (nr_pages--)
+ put_page_bootmem(page++);
+ } else
+ while (nr_pages--)
+ free_reserved_page(page++);
+ } else
+ free_pages((unsigned long)page_address(page), order);
}
-EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
-#endif
+static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+ pte_t *pte;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ pte = pte_start + i;
+ if (pte_val(*pte))
+ return;
+ }
+
+ /* free a pte talbe */
+ free_pagetable(pmd_page(*pmd), 0);
+ spin_lock(&init_mm.page_table_lock);
+ pmd_clear(pmd);
+ spin_unlock(&init_mm.page_table_lock);
+}
+
+static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+ pmd_t *pmd;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ pmd = pmd_start + i;
+ if (pmd_val(*pmd))
+ return;
+ }
+
+ /* free a pmd talbe */
+ free_pagetable(pud_page(*pud), 0);
+ spin_lock(&init_mm.page_table_lock);
+ pud_clear(pud);
+ spin_unlock(&init_mm.page_table_lock);
+}
+
+/* Return true if pgd is changed, otherwise return false. */
+static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd)
+{
+ pud_t *pud;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PUD; i++) {
+ pud = pud_start + i;
+ if (pud_val(*pud))
+ return false;
+ }
+
+ /* free a pud table */
+ free_pagetable(pgd_page(*pgd), 0);
+ spin_lock(&init_mm.page_table_lock);
+ pgd_clear(pgd);
+ spin_unlock(&init_mm.page_table_lock);
+
+ return true;
+}
+
+static void __meminit
+remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
+ bool direct)
+{
+ unsigned long next, pages = 0;
+ pte_t *pte;
+ void *page_addr;
+ phys_addr_t phys_addr;
+
+ pte = pte_start + pte_index(addr);
+ for (; addr < end; addr = next, pte++) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ if (next > end)
+ next = end;
+
+ if (!pte_present(*pte))
+ continue;
+
+ /*
+ * We mapped [0,1G) memory as identity mapping when
+ * initializing, in arch/x86/kernel/head_64.S. These
+ * pagetables cannot be removed.
+ */
+ phys_addr = pte_val(*pte) + (addr & PAGE_MASK);
+ if (phys_addr < (phys_addr_t)0x40000000)
+ return;
+
+ if (IS_ALIGNED(addr, PAGE_SIZE) &&
+ IS_ALIGNED(next, PAGE_SIZE)) {
+ /*
+ * Do not free direct mapping pages since they were
+ * freed when offlining, or simplely not in use.
+ */
+ if (!direct)
+ free_pagetable(pte_page(*pte), 0);
+
+ spin_lock(&init_mm.page_table_lock);
+ pte_clear(&init_mm, addr, pte);
+ spin_unlock(&init_mm.page_table_lock);
+
+ /* For non-direct mapping, pages means nothing. */
+ pages++;
+ } else {
+ /*
+ * If we are here, we are freeing vmemmap pages since
+ * direct mapped memory ranges to be freed are aligned.
+ *
+ * If we are not removing the whole page, it means
+ * other page structs in this page are being used and
+ * we canot remove them. So fill the unused page_structs
+ * with 0xFD, and remove the page when it is wholly
+ * filled with 0xFD.
+ */
+ memset((void *)addr, PAGE_INUSE, next - addr);
+
+ page_addr = page_address(pte_page(*pte));
+ if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
+ free_pagetable(pte_page(*pte), 0);
+
+ spin_lock(&init_mm.page_table_lock);
+ pte_clear(&init_mm, addr, pte);
+ spin_unlock(&init_mm.page_table_lock);
+ }
+ }
+ }
+
+ /* Call free_pte_table() in remove_pmd_table(). */
+ flush_tlb_all();
+ if (direct)
+ update_page_count(PG_LEVEL_4K, -pages);
+}
+
+static void __meminit
+remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
+ bool direct)
+{
+ unsigned long next, pages = 0;
+ pte_t *pte_base;
+ pmd_t *pmd;
+ void *page_addr;
+
+ pmd = pmd_start + pmd_index(addr);
+ for (; addr < end; addr = next, pmd++) {
+ next = pmd_addr_end(addr, end);
+
+ if (!pmd_present(*pmd))
+ continue;
+
+ if (pmd_large(*pmd)) {
+ if (IS_ALIGNED(addr, PMD_SIZE) &&
+ IS_ALIGNED(next, PMD_SIZE)) {
+ if (!direct)
+ free_pagetable(pmd_page(*pmd),
+ get_order(PMD_SIZE));
+
+ spin_lock(&init_mm.page_table_lock);
+ pmd_clear(pmd);
+ spin_unlock(&init_mm.page_table_lock);
+ pages++;
+ } else {
+ /* If here, we are freeing vmemmap pages. */
+ memset((void *)addr, PAGE_INUSE, next - addr);
+
+ page_addr = page_address(pmd_page(*pmd));
+ if (!memchr_inv(page_addr, PAGE_INUSE,
+ PMD_SIZE)) {
+ free_pagetable(pmd_page(*pmd),
+ get_order(PMD_SIZE));
+
+ spin_lock(&init_mm.page_table_lock);
+ pmd_clear(pmd);
+ spin_unlock(&init_mm.page_table_lock);
+ }
+ }
+
+ continue;
+ }
+
+ pte_base = (pte_t *)pmd_page_vaddr(*pmd);
+ remove_pte_table(pte_base, addr, next, direct);
+ free_pte_table(pte_base, pmd);
+ }
+
+ /* Call free_pmd_table() in remove_pud_table(). */
+ if (direct)
+ update_page_count(PG_LEVEL_2M, -pages);
+}
+
+static void __meminit
+remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
+ bool direct)
+{
+ unsigned long next, pages = 0;
+ pmd_t *pmd_base;
+ pud_t *pud;
+ void *page_addr;
+
+ pud = pud_start + pud_index(addr);
+ for (; addr < end; addr = next, pud++) {
+ next = pud_addr_end(addr, end);
+
+ if (!pud_present(*pud))
+ continue;
+
+ if (pud_large(*pud)) {
+ if (IS_ALIGNED(addr, PUD_SIZE) &&
+ IS_ALIGNED(next, PUD_SIZE)) {
+ if (!direct)
+ free_pagetable(pud_page(*pud),
+ get_order(PUD_SIZE));
+
+ spin_lock(&init_mm.page_table_lock);
+ pud_clear(pud);
+ spin_unlock(&init_mm.page_table_lock);
+ pages++;
+ } else {
+ /* If here, we are freeing vmemmap pages. */
+ memset((void *)addr, PAGE_INUSE, next - addr);
+
+ page_addr = page_address(pud_page(*pud));
+ if (!memchr_inv(page_addr, PAGE_INUSE,
+ PUD_SIZE)) {
+ free_pagetable(pud_page(*pud),
+ get_order(PUD_SIZE));
+
+ spin_lock(&init_mm.page_table_lock);
+ pud_clear(pud);
+ spin_unlock(&init_mm.page_table_lock);
+ }
+ }
+
+ continue;
+ }
+
+ pmd_base = (pmd_t *)pud_page_vaddr(*pud);
+ remove_pmd_table(pmd_base, addr, next, direct);
+ free_pmd_table(pmd_base, pud);
+ }
+
+ if (direct)
+ update_page_count(PG_LEVEL_1G, -pages);
+}
+
+/* start and end are both virtual address. */
+static void __meminit
+remove_pagetable(unsigned long start, unsigned long end, bool direct)
+{
+ unsigned long next;
+ pgd_t *pgd;
+ pud_t *pud;
+ bool pgd_changed = false;
+
+ for (; start < end; start = next) {
+ next = pgd_addr_end(start, end);
+
+ pgd = pgd_offset_k(start);
+ if (!pgd_present(*pgd))
+ continue;
+
+ pud = (pud_t *)pgd_page_vaddr(*pgd);
+ remove_pud_table(pud, start, next, direct);
+ if (free_pud_table(pud, pgd))
+ pgd_changed = true;
+ }
+
+ if (pgd_changed)
+ sync_global_pgds(start, end - 1);
+
+ flush_tlb_all();
+}
+
+void __ref vmemmap_free(unsigned long start, unsigned long end)
+{
+ remove_pagetable(start, end, false);
+}
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+static void __meminit
+kernel_physical_mapping_remove(unsigned long start, unsigned long end)
+{
+ start = (unsigned long)__va(start);
+ end = (unsigned long)__va(end);
+
+ remove_pagetable(start, end, true);
+}
+
+int __ref arch_remove_memory(u64 start, u64 size)
+{
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+ struct zone *zone;
+ int ret;
+
+ zone = page_zone(pfn_to_page(start_pfn));
+ kernel_physical_mapping_remove(start, start + size);
+ ret = __remove_pages(zone, start_pfn, nr_pages);
+ WARN_ON_ONCE(ret);
+
+ return ret;
+}
+#endif
#endif /* CONFIG_MEMORY_HOTPLUG */
static struct kcore_list kcore_vsyscall;
-void __init mem_init(void)
+static void __init register_page_bootmem_info(void)
{
- long codesize, reservedpages, datasize, initsize;
- unsigned long absent_pages;
+#ifdef CONFIG_NUMA
+ int i;
+
+ for_each_online_node(i)
+ register_page_bootmem_info_node(NODE_DATA(i));
+#endif
+}
+void __init mem_init(void)
+{
pci_iommu_alloc();
/* clear_bss() already clear the empty_zero_page */
- reservedpages = 0;
+ register_page_bootmem_info();
- /* this will put all low memory onto the freelists */
-#ifdef CONFIG_NUMA
- totalram_pages = numa_free_all_bootmem();
-#else
- totalram_pages = free_all_bootmem();
-#endif
-
- absent_pages = absent_pages_in_range(0, max_pfn);
- reservedpages = max_pfn - totalram_pages - absent_pages;
+ /* this will put all memory onto the freelists */
+ free_all_bootmem();
after_bootmem = 1;
- codesize = (unsigned long) &_etext - (unsigned long) &_text;
- datasize = (unsigned long) &_edata - (unsigned long) &_etext;
- initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
-
/* Register memory areas for /proc/kcore */
- kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
- VSYSCALL_END - VSYSCALL_START, KCORE_OTHER);
+ kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR,
+ PAGE_SIZE, KCORE_OTHER);
- printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
- "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n",
- nr_free_pages() << (PAGE_SHIFT-10),
- max_pfn << (PAGE_SHIFT-10),
- codesize >> 10,
- absent_pages << (PAGE_SHIFT-10),
- reservedpages << (PAGE_SHIFT-10),
- datasize >> 10,
- initsize >> 10);
+ mem_init_print_info(NULL);
}
#ifdef CONFIG_DEBUG_RODATA
@@ -785,12 +1106,11 @@ void set_kernel_text_ro(void)
void mark_rodata_ro(void)
{
unsigned long start = PFN_ALIGN(_text);
- unsigned long rodata_start =
- ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
+ unsigned long rodata_start = PFN_ALIGN(__start_rodata);
unsigned long end = (unsigned long) &__end_rodata_hpage_align;
- unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
- unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
- unsigned long data_start = (unsigned long) &_sdata;
+ unsigned long text_end = PFN_ALIGN(&__stop___ex_table);
+ unsigned long rodata_end = PFN_ALIGN(&__end_rodata);
+ unsigned long all_end = PFN_ALIGN(&_end);
printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
(end - start) >> 10);
@@ -799,10 +1119,10 @@ void mark_rodata_ro(void)
kernel_set_to_readonly = 1;
/*
- * The rodata section (but not the kernel text!) should also be
- * not-executable.
+ * The rodata/data/bss/brk section (but not the kernel text!)
+ * should also be not-executable.
*/
- set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
+ set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT);
rodata_test();
@@ -814,13 +1134,12 @@ void mark_rodata_ro(void)
set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif
- free_init_pages("unused kernel memory",
- (unsigned long) page_address(virt_to_page(text_end)),
- (unsigned long)
- page_address(virt_to_page(rodata_start)));
- free_init_pages("unused kernel memory",
- (unsigned long) page_address(virt_to_page(rodata_end)),
- (unsigned long) page_address(virt_to_page(data_start)));
+ free_init_pages("unused kernel",
+ (unsigned long) __va(__pa_symbol(text_end)),
+ (unsigned long) __va(__pa_symbol(rodata_start)));
+ free_init_pages("unused kernel",
+ (unsigned long) __va(__pa_symbol(rodata_end)),
+ (unsigned long) __va(__pa_symbol(_sdata)));
}
#endif
@@ -844,6 +1163,9 @@ int kern_addr_valid(unsigned long addr)
if (pud_none(*pud))
return 0;
+ if (pud_large(*pud))
+ return pfn_valid(pud_pfn(*pud));
+
pmd = pmd_offset(pud, addr);
if (pmd_none(*pmd))
return 0;
@@ -863,25 +1185,33 @@ int kern_addr_valid(unsigned long addr)
* covers the 64bit vsyscall page now. 32bit has a real VMA now and does
* not need special handling anymore:
*/
+static const char *gate_vma_name(struct vm_area_struct *vma)
+{
+ return "[vsyscall]";
+}
+static struct vm_operations_struct gate_vma_ops = {
+ .name = gate_vma_name,
+};
static struct vm_area_struct gate_vma = {
- .vm_start = VSYSCALL_START,
- .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
+ .vm_start = VSYSCALL_ADDR,
+ .vm_end = VSYSCALL_ADDR + PAGE_SIZE,
.vm_page_prot = PAGE_READONLY_EXEC,
- .vm_flags = VM_READ | VM_EXEC
+ .vm_flags = VM_READ | VM_EXEC,
+ .vm_ops = &gate_vma_ops,
};
-struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
+struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
#ifdef CONFIG_IA32_EMULATION
- if (test_tsk_thread_flag(tsk, TIF_IA32))
+ if (!mm || mm->context.ia32_compat)
return NULL;
#endif
return &gate_vma;
}
-int in_gate_area(struct task_struct *task, unsigned long addr)
+int in_gate_area(struct mm_struct *mm, unsigned long addr)
{
- struct vm_area_struct *vma = get_gate_vma(task);
+ struct vm_area_struct *vma = get_gate_vma(mm);
if (!vma)
return 0;
@@ -890,22 +1220,50 @@ int in_gate_area(struct task_struct *task, unsigned long addr)
}
/*
- * Use this when you have no reliable task/vma, typically from interrupt
- * context. It is less reliable than using the task's vma and may give
- * false positives:
+ * Use this when you have no reliable mm, typically from interrupt
+ * context. It is less reliable than using a task's mm and may give
+ * false positives.
*/
-int in_gate_area_no_task(unsigned long addr)
+int in_gate_area_no_mm(unsigned long addr)
{
- return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
+ return (addr & PAGE_MASK) == VSYSCALL_ADDR;
+}
+
+static unsigned long probe_memory_block_size(void)
+{
+ /* start from 2g */
+ unsigned long bz = 1UL<<31;
+
+#ifdef CONFIG_X86_UV
+ if (is_uv_system()) {
+ printk(KERN_INFO "UV: memory block size 2GB\n");
+ return 2UL * 1024 * 1024 * 1024;
+ }
+#endif
+
+ /* less than 64g installed */
+ if ((max_pfn << PAGE_SHIFT) < (16UL << 32))
+ return MIN_MEMORY_BLOCK_SIZE;
+
+ /* get the tail size */
+ while (bz > MIN_MEMORY_BLOCK_SIZE) {
+ if (!((max_pfn << PAGE_SHIFT) & (bz - 1)))
+ break;
+ bz >>= 1;
+ }
+
+ printk(KERN_DEBUG "memory block size : %ldMB\n", bz >> 20);
+
+ return bz;
}
-const char *arch_vma_name(struct vm_area_struct *vma)
+static unsigned long memory_block_size_probed;
+unsigned long memory_block_size_bytes(void)
{
- if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
- return "[vdso]";
- if (vma == &gate_vma)
- return "[vsyscall]";
- return NULL;
+ if (!memory_block_size_probed)
+ memory_block_size_probed = probe_memory_block_size();
+
+ return memory_block_size_probed;
}
#ifdef CONFIG_SPARSEMEM_VMEMMAP
@@ -916,18 +1274,17 @@ static long __meminitdata addr_start, addr_end;
static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;
-int __meminit
-vmemmap_populate(struct page *start_page, unsigned long size, int node)
+static int __meminit vmemmap_populate_hugepages(unsigned long start,
+ unsigned long end, int node)
{
- unsigned long addr = (unsigned long)start_page;
- unsigned long end = (unsigned long)(start_page + size);
+ unsigned long addr;
unsigned long next;
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
- for (; addr < end; addr = next) {
- void *p = NULL;
+ for (addr = start; addr < end; addr = next) {
+ next = pmd_addr_end(addr, end);
pgd = vmemmap_pgd_populate(addr, node);
if (!pgd)
@@ -937,31 +1294,14 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
if (!pud)
return -ENOMEM;
- if (!cpu_has_pse) {
- next = (addr + PAGE_SIZE) & PAGE_MASK;
- pmd = vmemmap_pmd_populate(pud, addr, node);
-
- if (!pmd)
- return -ENOMEM;
-
- p = vmemmap_pte_populate(pmd, addr, node);
-
- if (!p)
- return -ENOMEM;
-
- addr_end = addr + PAGE_SIZE;
- p_end = p + PAGE_SIZE;
- } else {
- next = pmd_addr_end(addr, end);
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd)) {
+ void *p;
- pmd = pmd_offset(pud, addr);
- if (pmd_none(*pmd)) {
+ p = vmemmap_alloc_block_buf(PMD_SIZE, node);
+ if (p) {
pte_t entry;
- p = vmemmap_alloc_block_buf(PMD_SIZE, node);
- if (!p)
- return -ENOMEM;
-
entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
PAGE_KERNEL_LARGE);
set_pmd(pmd, __pmd(pte_val(entry)));
@@ -978,15 +1318,92 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
addr_end = addr + PMD_SIZE;
p_end = p + PMD_SIZE;
- } else
- vmemmap_verify((pte_t *)pmd, node, addr, next);
+ continue;
+ }
+ } else if (pmd_large(*pmd)) {
+ vmemmap_verify((pte_t *)pmd, node, addr, next);
+ continue;
}
-
+ pr_warn_once("vmemmap: falling back to regular page backing\n");
+ if (vmemmap_populate_basepages(addr, next, node))
+ return -ENOMEM;
}
- sync_global_pgds((unsigned long)start_page, end);
return 0;
}
+int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
+{
+ int err;
+
+ if (cpu_has_pse)
+ err = vmemmap_populate_hugepages(start, end, node);
+ else
+ err = vmemmap_populate_basepages(start, end, node);
+ if (!err)
+ sync_global_pgds(start, end - 1);
+ return err;
+}
+
+#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE)
+void register_page_bootmem_memmap(unsigned long section_nr,
+ struct page *start_page, unsigned long size)
+{
+ unsigned long addr = (unsigned long)start_page;
+ unsigned long end = (unsigned long)(start_page + size);
+ unsigned long next;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ unsigned int nr_pages;
+ struct page *page;
+
+ for (; addr < end; addr = next) {
+ pte_t *pte = NULL;
+
+ pgd = pgd_offset_k(addr);
+ if (pgd_none(*pgd)) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ continue;
+ }
+ get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO);
+
+ pud = pud_offset(pgd, addr);
+ if (pud_none(*pud)) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ continue;
+ }
+ get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);
+
+ if (!cpu_has_pse) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd))
+ continue;
+ get_page_bootmem(section_nr, pmd_page(*pmd),
+ MIX_SECTION_INFO);
+
+ pte = pte_offset_kernel(pmd, addr);
+ if (pte_none(*pte))
+ continue;
+ get_page_bootmem(section_nr, pte_page(*pte),
+ SECTION_INFO);
+ } else {
+ next = pmd_addr_end(addr, end);
+
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd))
+ continue;
+
+ nr_pages = 1 << (get_order(PMD_SIZE));
+ page = pmd_page(*pmd);
+ while (nr_pages--)
+ get_page_bootmem(section_nr, page++,
+ SECTION_INFO);
+ }
+ }
+}
+#endif
+
void __meminit vmemmap_populate_print_last(void)
{
if (p_start) {
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 0369843511d..baff1da354e 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -50,6 +50,21 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size,
return err;
}
+static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages,
+ void *arg)
+{
+ unsigned long i;
+
+ for (i = 0; i < nr_pages; ++i)
+ if (pfn_valid(start_pfn + i) &&
+ !PageReserved(pfn_to_page(start_pfn + i)))
+ return 1;
+
+ WARN_ONCE(1, "ioremap on RAM pfn 0x%lx\n", start_pfn);
+
+ return 0;
+}
+
/*
* Remap an arbitrary physical address space into the kernel virtual
* address space. Needed when the kernel wants to access high addresses
@@ -91,23 +106,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
return (__force void __iomem *)phys_to_virt(phys_addr);
/*
- * Check if the request spans more than any BAR in the iomem resource
- * tree.
- */
- WARN_ONCE(iomem_map_sanity_check(phys_addr, size),
- KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
-
- /*
* Don't allow anybody to remap normal RAM that we're using..
*/
+ pfn = phys_addr >> PAGE_SHIFT;
last_pfn = last_addr >> PAGE_SHIFT;
- for (pfn = phys_addr >> PAGE_SHIFT; pfn <= last_pfn; pfn++) {
- int is_ram = page_is_ram(pfn);
-
- if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
- return NULL;
- WARN_ON_ONCE(is_ram);
- }
+ if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL,
+ __ioremap_check_ram) == 1)
+ return NULL;
/*
* Mappings have to be page-aligned
@@ -170,6 +175,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
ret_addr = (void __iomem *) (vaddr + offset);
mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
+ /*
+ * Check if the request spans more than any BAR in the iomem resource
+ * tree.
+ */
+ WARN_ONCE(iomem_map_sanity_check(unaligned_phys_addr, unaligned_size),
+ KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
+
return ret_addr;
err_free_area:
free_vm_area(area);
@@ -180,7 +192,7 @@ err_free_memtype:
/**
* ioremap_nocache - map bus memory into CPU space
- * @offset: bus address of the memory
+ * @phys_addr: bus address of the memory
* @size: size of the resource to map
*
* ioremap_nocache performs a platform specific sequence of operations to
@@ -217,7 +229,7 @@ EXPORT_SYMBOL(ioremap_nocache);
/**
* ioremap_wc - map memory into CPU space write combined
- * @offset: bus address of the memory
+ * @phys_addr: bus address of the memory
* @size: size of the resource to map
*
* This version of ioremap ensures that the memory is marked write combining.
@@ -282,12 +294,7 @@ void iounmap(volatile void __iomem *addr)
in parallel. Reuse of the virtual address is prevented by
leaving it in the global lists until we're done with it.
cpa takes care of the direct mappings. */
- read_lock(&vmlist_lock);
- for (p = vmlist; p; p = p->next) {
- if (p->addr == (void __force *)addr)
- break;
- }
- read_unlock(&vmlist_lock);
+ p = find_vm_area((void __force *)addr);
if (!p) {
printk(KERN_ERR "iounmap: bad address %p\n", addr);
@@ -333,17 +340,6 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
return;
}
-static int __initdata early_ioremap_debug;
-
-static int __init early_ioremap_debug_setup(char *str)
-{
- early_ioremap_debug = 1;
-
- return 0;
-}
-early_param("early_ioremap_debug", early_ioremap_debug_setup);
-
-static __initdata int after_paging_init;
static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
@@ -367,18 +363,17 @@ bool __init is_early_ioremap_ptep(pte_t *ptep)
return ptep >= &bm_pte[0] && ptep < &bm_pte[PAGE_SIZE/sizeof(pte_t)];
}
-static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
-
void __init early_ioremap_init(void)
{
pmd_t *pmd;
- int i;
- if (early_ioremap_debug)
- printk(KERN_INFO "early_ioremap_init()\n");
+#ifdef CONFIG_X86_64
+ BUILD_BUG_ON((fix_to_virt(0) + PAGE_SIZE) & ((1 << PMD_SHIFT) - 1));
+#else
+ WARN_ON((fix_to_virt(0) + PAGE_SIZE) & ((1 << PMD_SHIFT) - 1));
+#endif
- for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
- slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
+ early_ioremap_setup();
pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
memset(bm_pte, 0, sizeof(bm_pte));
@@ -407,13 +402,8 @@ void __init early_ioremap_init(void)
}
}
-void __init early_ioremap_reset(void)
-{
- after_paging_init = 1;
-}
-
-static void __init __early_set_fixmap(enum fixed_addresses idx,
- phys_addr_t phys, pgprot_t flags)
+void __init __early_set_fixmap(enum fixed_addresses idx,
+ phys_addr_t phys, pgprot_t flags)
{
unsigned long addr = __fix_to_virt(idx);
pte_t *pte;
@@ -430,199 +420,3 @@ static void __init __early_set_fixmap(enum fixed_addresses idx,
pte_clear(&init_mm, addr, pte);
__flush_tlb_one(addr);
}
-
-static inline void __init early_set_fixmap(enum fixed_addresses idx,
- phys_addr_t phys, pgprot_t prot)
-{
- if (after_paging_init)
- __set_fixmap(idx, phys, prot);
- else
- __early_set_fixmap(idx, phys, prot);
-}
-
-static inline void __init early_clear_fixmap(enum fixed_addresses idx)
-{
- if (after_paging_init)
- clear_fixmap(idx);
- else
- __early_set_fixmap(idx, 0, __pgprot(0));
-}
-
-static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
-static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
-
-void __init fixup_early_ioremap(void)
-{
- int i;
-
- for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
- if (prev_map[i]) {
- WARN_ON(1);
- break;
- }
- }
-
- early_ioremap_init();
-}
-
-static int __init check_early_ioremap_leak(void)
-{
- int count = 0;
- int i;
-
- for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
- if (prev_map[i])
- count++;
-
- if (!count)
- return 0;
- WARN(1, KERN_WARNING
- "Debug warning: early ioremap leak of %d areas detected.\n",
- count);
- printk(KERN_WARNING
- "please boot with early_ioremap_debug and report the dmesg.\n");
-
- return 1;
-}
-late_initcall(check_early_ioremap_leak);
-
-static void __init __iomem *
-__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
-{
- unsigned long offset;
- resource_size_t last_addr;
- unsigned int nrpages;
- enum fixed_addresses idx0, idx;
- int i, slot;
-
- WARN_ON(system_state != SYSTEM_BOOTING);
-
- slot = -1;
- for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
- if (!prev_map[i]) {
- slot = i;
- break;
- }
- }
-
- if (slot < 0) {
- printk(KERN_INFO "early_iomap(%08llx, %08lx) not found slot\n",
- (u64)phys_addr, size);
- WARN_ON(1);
- return NULL;
- }
-
- if (early_ioremap_debug) {
- printk(KERN_INFO "early_ioremap(%08llx, %08lx) [%d] => ",
- (u64)phys_addr, size, slot);
- dump_stack();
- }
-
- /* Don't allow wraparound or zero size */
- last_addr = phys_addr + size - 1;
- if (!size || last_addr < phys_addr) {
- WARN_ON(1);
- return NULL;
- }
-
- prev_size[slot] = size;
- /*
- * Mappings have to be page-aligned
- */
- offset = phys_addr & ~PAGE_MASK;
- phys_addr &= PAGE_MASK;
- size = PAGE_ALIGN(last_addr + 1) - phys_addr;
-
- /*
- * Mappings have to fit in the FIX_BTMAP area.
- */
- nrpages = size >> PAGE_SHIFT;
- if (nrpages > NR_FIX_BTMAPS) {
- WARN_ON(1);
- return NULL;
- }
-
- /*
- * Ok, go for it..
- */
- idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
- idx = idx0;
- while (nrpages > 0) {
- early_set_fixmap(idx, phys_addr, prot);
- phys_addr += PAGE_SIZE;
- --idx;
- --nrpages;
- }
- if (early_ioremap_debug)
- printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]);
-
- prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
- return prev_map[slot];
-}
-
-/* Remap an IO device */
-void __init __iomem *
-early_ioremap(resource_size_t phys_addr, unsigned long size)
-{
- return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO);
-}
-
-/* Remap memory */
-void __init __iomem *
-early_memremap(resource_size_t phys_addr, unsigned long size)
-{
- return __early_ioremap(phys_addr, size, PAGE_KERNEL);
-}
-
-void __init early_iounmap(void __iomem *addr, unsigned long size)
-{
- unsigned long virt_addr;
- unsigned long offset;
- unsigned int nrpages;
- enum fixed_addresses idx;
- int i, slot;
-
- slot = -1;
- for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
- if (prev_map[i] == addr) {
- slot = i;
- break;
- }
- }
-
- if (slot < 0) {
- printk(KERN_INFO "early_iounmap(%p, %08lx) not found slot\n",
- addr, size);
- WARN_ON(1);
- return;
- }
-
- if (prev_size[slot] != size) {
- printk(KERN_INFO "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n",
- addr, size, slot, prev_size[slot]);
- WARN_ON(1);
- return;
- }
-
- if (early_ioremap_debug) {
- printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
- size, slot);
- dump_stack();
- }
-
- virt_addr = (unsigned long)addr;
- if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) {
- WARN_ON(1);
- return;
- }
- offset = virt_addr & ~PAGE_MASK;
- nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT;
-
- idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
- while (nrpages > 0) {
- early_clear_fixmap(idx);
- --idx;
- --nrpages;
- }
- prev_map[slot] = NULL;
-}
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
index af3b6c8a436..dab41876cdd 100644
--- a/arch/x86/mm/kmemcheck/error.c
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -185,7 +185,7 @@ void kmemcheck_error_save(enum kmemcheck_shadow state,
e->trace.entries = e->trace_entries;
e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
e->trace.skip = 0;
- save_stack_trace_bp(&e->trace, regs->bp);
+ save_stack_trace_regs(regs, &e->trace);
/* Round address down to nearest 16 bytes */
shadow_copy = kmemcheck_shadow_lookup(address
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
index d87dd6d042d..dd89a13f105 100644
--- a/arch/x86/mm/kmemcheck/kmemcheck.c
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -78,10 +78,16 @@ early_initcall(kmemcheck_init);
*/
static int __init param_kmemcheck(char *str)
{
+ int val;
+ int ret;
+
if (!str)
return -EINVAL;
- sscanf(str, "%d", &kmemcheck_enabled);
+ ret = kstrtoint(str, 0, &val);
+ if (ret)
+ return ret;
+ kmemcheck_enabled = val;
return 0;
}
diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c
index 036efbea8b2..aef7140c006 100644
--- a/arch/x86/mm/kmemcheck/selftest.c
+++ b/arch/x86/mm/kmemcheck/selftest.c
@@ -1,3 +1,4 @@
+#include <linux/bug.h>
#include <linux/kernel.h>
#include "opcode.h"
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index e5d5e2ce9f7..637ab34ed63 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -11,7 +11,6 @@
#include <linux/rculist.h>
#include <linux/spinlock.h>
#include <linux/hash.h>
-#include <linux/init.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>
diff --git a/arch/x86/mm/memblock.c b/arch/x86/mm/memblock.c
deleted file mode 100644
index aa1169392b8..00000000000
--- a/arch/x86/mm/memblock.c
+++ /dev/null
@@ -1,348 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/bitops.h>
-#include <linux/memblock.h>
-#include <linux/bootmem.h>
-#include <linux/mm.h>
-#include <linux/range.h>
-
-/* Check for already reserved areas */
-static bool __init check_with_memblock_reserved_size(u64 *addrp, u64 *sizep, u64 align)
-{
- struct memblock_region *r;
- u64 addr = *addrp, last;
- u64 size = *sizep;
- bool changed = false;
-
-again:
- last = addr + size;
- for_each_memblock(reserved, r) {
- if (last > r->base && addr < r->base) {
- size = r->base - addr;
- changed = true;
- goto again;
- }
- if (last > (r->base + r->size) && addr < (r->base + r->size)) {
- addr = round_up(r->base + r->size, align);
- size = last - addr;
- changed = true;
- goto again;
- }
- if (last <= (r->base + r->size) && addr >= r->base) {
- *sizep = 0;
- return false;
- }
- }
- if (changed) {
- *addrp = addr;
- *sizep = size;
- }
- return changed;
-}
-
-/*
- * Find next free range after start, and size is returned in *sizep
- */
-u64 __init memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align)
-{
- struct memblock_region *r;
-
- for_each_memblock(memory, r) {
- u64 ei_start = r->base;
- u64 ei_last = ei_start + r->size;
- u64 addr;
-
- addr = round_up(ei_start, align);
- if (addr < start)
- addr = round_up(start, align);
- if (addr >= ei_last)
- continue;
- *sizep = ei_last - addr;
- while (check_with_memblock_reserved_size(&addr, sizep, align))
- ;
-
- if (*sizep)
- return addr;
- }
-
- return MEMBLOCK_ERROR;
-}
-
-static __init struct range *find_range_array(int count)
-{
- u64 end, size, mem;
- struct range *range;
-
- size = sizeof(struct range) * count;
- end = memblock.current_limit;
-
- mem = memblock_find_in_range(0, end, size, sizeof(struct range));
- if (mem == MEMBLOCK_ERROR)
- panic("can not find more space for range array");
-
- /*
- * This range is tempoaray, so don't reserve it, it will not be
- * overlapped because We will not alloccate new buffer before
- * We discard this one
- */
- range = __va(mem);
- memset(range, 0, size);
-
- return range;
-}
-
-static void __init memblock_x86_subtract_reserved(struct range *range, int az)
-{
- u64 final_start, final_end;
- struct memblock_region *r;
-
- /* Take out region array itself at first*/
- memblock_free_reserved_regions();
-
- memblock_dbg("Subtract (%ld early reservations)\n", memblock.reserved.cnt);
-
- for_each_memblock(reserved, r) {
- memblock_dbg(" [%010llx-%010llx]\n", (u64)r->base, (u64)r->base + r->size - 1);
- final_start = PFN_DOWN(r->base);
- final_end = PFN_UP(r->base + r->size);
- if (final_start >= final_end)
- continue;
- subtract_range(range, az, final_start, final_end);
- }
-
- /* Put region array back ? */
- memblock_reserve_reserved_regions();
-}
-
-struct count_data {
- int nr;
-};
-
-static int __init count_work_fn(unsigned long start_pfn,
- unsigned long end_pfn, void *datax)
-{
- struct count_data *data = datax;
-
- data->nr++;
-
- return 0;
-}
-
-static int __init count_early_node_map(int nodeid)
-{
- struct count_data data;
-
- data.nr = 0;
- work_with_active_regions(nodeid, count_work_fn, &data);
-
- return data.nr;
-}
-
-int __init __get_free_all_memory_range(struct range **rangep, int nodeid,
- unsigned long start_pfn, unsigned long end_pfn)
-{
- int count;
- struct range *range;
- int nr_range;
-
- count = (memblock.reserved.cnt + count_early_node_map(nodeid)) * 2;
-
- range = find_range_array(count);
- nr_range = 0;
-
- /*
- * Use early_node_map[] and memblock.reserved.region to get range array
- * at first
- */
- nr_range = add_from_early_node_map(range, count, nr_range, nodeid);
- subtract_range(range, count, 0, start_pfn);
- subtract_range(range, count, end_pfn, -1ULL);
-
- memblock_x86_subtract_reserved(range, count);
- nr_range = clean_sort_range(range, count);
-
- *rangep = range;
- return nr_range;
-}
-
-int __init get_free_all_memory_range(struct range **rangep, int nodeid)
-{
- unsigned long end_pfn = -1UL;
-
-#ifdef CONFIG_X86_32
- end_pfn = max_low_pfn;
-#endif
- return __get_free_all_memory_range(rangep, nodeid, 0, end_pfn);
-}
-
-static u64 __init __memblock_x86_memory_in_range(u64 addr, u64 limit, bool get_free)
-{
- int i, count;
- struct range *range;
- int nr_range;
- u64 final_start, final_end;
- u64 free_size;
- struct memblock_region *r;
-
- count = (memblock.reserved.cnt + memblock.memory.cnt) * 2;
-
- range = find_range_array(count);
- nr_range = 0;
-
- addr = PFN_UP(addr);
- limit = PFN_DOWN(limit);
-
- for_each_memblock(memory, r) {
- final_start = PFN_UP(r->base);
- final_end = PFN_DOWN(r->base + r->size);
- if (final_start >= final_end)
- continue;
- if (final_start >= limit || final_end <= addr)
- continue;
-
- nr_range = add_range(range, count, nr_range, final_start, final_end);
- }
- subtract_range(range, count, 0, addr);
- subtract_range(range, count, limit, -1ULL);
-
- /* Subtract memblock.reserved.region in range ? */
- if (!get_free)
- goto sort_and_count_them;
- for_each_memblock(reserved, r) {
- final_start = PFN_DOWN(r->base);
- final_end = PFN_UP(r->base + r->size);
- if (final_start >= final_end)
- continue;
- if (final_start >= limit || final_end <= addr)
- continue;
-
- subtract_range(range, count, final_start, final_end);
- }
-
-sort_and_count_them:
- nr_range = clean_sort_range(range, count);
-
- free_size = 0;
- for (i = 0; i < nr_range; i++)
- free_size += range[i].end - range[i].start;
-
- return free_size << PAGE_SHIFT;
-}
-
-u64 __init memblock_x86_free_memory_in_range(u64 addr, u64 limit)
-{
- return __memblock_x86_memory_in_range(addr, limit, true);
-}
-
-u64 __init memblock_x86_memory_in_range(u64 addr, u64 limit)
-{
- return __memblock_x86_memory_in_range(addr, limit, false);
-}
-
-void __init memblock_x86_reserve_range(u64 start, u64 end, char *name)
-{
- if (start == end)
- return;
-
- if (WARN_ONCE(start > end, "memblock_x86_reserve_range: wrong range [%#llx, %#llx)\n", start, end))
- return;
-
- memblock_dbg(" memblock_x86_reserve_range: [%#010llx-%#010llx] %16s\n", start, end - 1, name);
-
- memblock_reserve(start, end - start);
-}
-
-void __init memblock_x86_free_range(u64 start, u64 end)
-{
- if (start == end)
- return;
-
- if (WARN_ONCE(start > end, "memblock_x86_free_range: wrong range [%#llx, %#llx)\n", start, end))
- return;
-
- memblock_dbg(" memblock_x86_free_range: [%#010llx-%#010llx]\n", start, end - 1);
-
- memblock_free(start, end - start);
-}
-
-/*
- * Need to call this function after memblock_x86_register_active_regions,
- * so early_node_map[] is filled already.
- */
-u64 __init memblock_x86_find_in_range_node(int nid, u64 start, u64 end, u64 size, u64 align)
-{
- u64 addr;
- addr = find_memory_core_early(nid, size, align, start, end);
- if (addr != MEMBLOCK_ERROR)
- return addr;
-
- /* Fallback, should already have start end within node range */
- return memblock_find_in_range(start, end, size, align);
-}
-
-/*
- * Finds an active region in the address range from start_pfn to last_pfn and
- * returns its range in ei_startpfn and ei_endpfn for the memblock entry.
- */
-static int __init memblock_x86_find_active_region(const struct memblock_region *ei,
- unsigned long start_pfn,
- unsigned long last_pfn,
- unsigned long *ei_startpfn,
- unsigned long *ei_endpfn)
-{
- u64 align = PAGE_SIZE;
-
- *ei_startpfn = round_up(ei->base, align) >> PAGE_SHIFT;
- *ei_endpfn = round_down(ei->base + ei->size, align) >> PAGE_SHIFT;
-
- /* Skip map entries smaller than a page */
- if (*ei_startpfn >= *ei_endpfn)
- return 0;
-
- /* Skip if map is outside the node */
- if (*ei_endpfn <= start_pfn || *ei_startpfn >= last_pfn)
- return 0;
-
- /* Check for overlaps */
- if (*ei_startpfn < start_pfn)
- *ei_startpfn = start_pfn;
- if (*ei_endpfn > last_pfn)
- *ei_endpfn = last_pfn;
-
- return 1;
-}
-
-/* Walk the memblock.memory map and register active regions within a node */
-void __init memblock_x86_register_active_regions(int nid, unsigned long start_pfn,
- unsigned long last_pfn)
-{
- unsigned long ei_startpfn;
- unsigned long ei_endpfn;
- struct memblock_region *r;
-
- for_each_memblock(memory, r)
- if (memblock_x86_find_active_region(r, start_pfn, last_pfn,
- &ei_startpfn, &ei_endpfn))
- add_active_range(nid, ei_startpfn, ei_endpfn);
-}
-
-/*
- * Find the hole size (in bytes) in the memory range.
- * @start: starting address of the memory range to scan
- * @end: ending address of the memory range to scan
- */
-u64 __init memblock_x86_hole_size(u64 start, u64 end)
-{
- unsigned long start_pfn = start >> PAGE_SHIFT;
- unsigned long last_pfn = end >> PAGE_SHIFT;
- unsigned long ei_startpfn, ei_endpfn, ram = 0;
- struct memblock_region *r;
-
- for_each_memblock(memory, r)
- if (memblock_x86_find_active_region(r, start_pfn, last_pfn,
- &ei_startpfn, &ei_endpfn))
- ram += ei_endpfn - ei_startpfn;
-
- return end - start - ((u64)ram << PAGE_SHIFT);
-}
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 92faf3a1c53..1e9da795767 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -9,6 +9,7 @@
#include <linux/memblock.h>
static u64 patterns[] __initdata = {
+ /* The first entry has to be 0 to leave memtest with zeroed memory */
0,
0xffffffffffffffffULL,
0x5555555555555555ULL,
@@ -34,7 +35,7 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
(unsigned long long) pattern,
(unsigned long long) start_bad,
(unsigned long long) end_bad);
- memblock_x86_reserve_range(start_bad, end_bad, "BAD RAM");
+ memblock_reserve(start_bad, end_bad - start_bad);
}
static void __init memtest(u64 pattern, u64 start_phys, u64 size)
@@ -70,24 +71,19 @@ static void __init memtest(u64 pattern, u64 start_phys, u64 size)
static void __init do_one_pass(u64 pattern, u64 start, u64 end)
{
- u64 size = 0;
-
- while (start < end) {
- start = memblock_x86_find_in_range_size(start, &size, 1);
-
- /* done ? */
- if (start >= end)
- break;
- if (start + size > end)
- size = end - start;
-
- printk(KERN_INFO " %010llx - %010llx pattern %016llx\n",
- (unsigned long long) start,
- (unsigned long long) start + size,
- (unsigned long long) cpu_to_be64(pattern));
- memtest(pattern, start, size);
-
- start += size;
+ u64 i;
+ phys_addr_t this_start, this_end;
+
+ for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) {
+ this_start = clamp_t(phys_addr_t, this_start, start, end);
+ this_end = clamp_t(phys_addr_t, this_end, start, end);
+ if (this_start < this_end) {
+ printk(KERN_INFO " %010llx - %010llx pattern %016llx\n",
+ (unsigned long long)this_start,
+ (unsigned long long)this_end,
+ (unsigned long long)cpu_to_be64(pattern));
+ memtest(pattern, this_start, this_end - this_start);
+ }
}
}
@@ -115,15 +111,8 @@ void __init early_memtest(unsigned long start, unsigned long end)
return;
printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern);
- for (i = 0; i < memtest_pattern; i++) {
+ for (i = memtest_pattern-1; i < UINT_MAX; --i) {
idx = i % ARRAY_SIZE(patterns);
do_one_pass(patterns[idx], start, end);
}
-
- if (idx > 0) {
- printk(KERN_INFO "early_memtest: wipe out "
- "test pattern from memory\n");
- /* additional test with pattern 0 will do this */
- do_one_pass(0, start, end);
- }
}
diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h
new file mode 100644
index 00000000000..6b563a11889
--- /dev/null
+++ b/arch/x86/mm/mm_internal.h
@@ -0,0 +1,19 @@
+#ifndef __X86_MM_INTERNAL_H
+#define __X86_MM_INTERNAL_H
+
+void *alloc_low_pages(unsigned int num);
+static inline void *alloc_low_page(void)
+{
+ return alloc_low_pages(1);
+}
+
+void early_ioremap_page_table_range_init(void);
+
+unsigned long kernel_physical_mapping_init(unsigned long start,
+ unsigned long end,
+ unsigned long page_size_mask);
+void zone_sizes_init(void);
+
+extern int after_bootmem;
+
+#endif /* __X86_MM_INTERNAL_H */
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 1dab5194fd9..25e7e1372bb 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -31,6 +31,10 @@
#include <linux/sched.h>
#include <asm/elf.h>
+struct __read_mostly va_alignment va_align = {
+ .flags = -1,
+};
+
static unsigned int stack_maxrandom_size(void)
{
unsigned int max = 0;
@@ -42,7 +46,6 @@ static unsigned int stack_maxrandom_size(void)
return max;
}
-
/*
* Top of mmap area (just below the process stack).
*
@@ -51,21 +54,6 @@ static unsigned int stack_maxrandom_size(void)
#define MIN_GAP (128*1024*1024UL + stack_maxrandom_size())
#define MAX_GAP (TASK_SIZE/6*5)
-/*
- * True on X86_32 or when emulating IA32 on X86_64
- */
-static int mmap_is_ia32(void)
-{
-#ifdef CONFIG_X86_32
- return 1;
-#endif
-#ifdef CONFIG_IA32_EMULATION
- if (test_thread_flag(TIF_IA32))
- return 1;
-#endif
- return 0;
-}
-
static int mmap_is_legacy(void)
{
if (current->personality & ADDR_COMPAT_LAYOUT)
@@ -87,9 +75,9 @@ static unsigned long mmap_rnd(void)
*/
if (current->flags & PF_RANDOMIZE) {
if (mmap_is_ia32())
- rnd = (long)get_random_int() % (1<<8);
+ rnd = get_random_int() % (1<<8);
else
- rnd = (long)(get_random_int() % (1<<28));
+ rnd = get_random_int() % (1<<28);
}
return rnd << PAGE_SHIFT;
}
@@ -124,13 +112,13 @@ static unsigned long mmap_legacy_base(void)
*/
void arch_pick_mmap_layout(struct mm_struct *mm)
{
+ mm->mmap_legacy_base = mmap_legacy_base();
+ mm->mmap_base = mmap_base();
+
if (mmap_is_legacy()) {
- mm->mmap_base = mmap_legacy_base();
+ mm->mmap_base = mm->mmap_legacy_base;
mm->get_unmapped_area = arch_get_unmapped_area;
- mm->unmap_area = arch_unmap_area;
} else {
- mm->mmap_base = mmap_base();
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
- mm->unmap_area = arch_unmap_area_topdown;
}
}
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index 3adff7dcc14..0057a7accfb 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -29,12 +29,11 @@
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/io.h>
-#include <linux/version.h>
#include <linux/kallsyms.h>
#include <asm/pgtable.h>
#include <linux/mmiotrace.h>
#include <asm/e820.h> /* for ISA_START_ADDRESS */
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
@@ -76,8 +75,8 @@ static LIST_HEAD(trace_list); /* struct remap_trace */
/* module parameters */
static unsigned long filter_offset;
-static int nommiotrace;
-static int trace_pc;
+static bool nommiotrace;
+static bool trace_pc;
module_param(filter_offset, ulong, 0);
module_param(nommiotrace, bool, 0);
@@ -411,9 +410,7 @@ out:
pr_warning("multiple CPUs still online, may miss events.\n");
}
-/* __ref because leave_uniprocessor calls cpu_up which is __cpuinit,
- but this whole function is ifdefed CONFIG_HOTPLUG_CPU */
-static void __ref leave_uniprocessor(void)
+static void leave_uniprocessor(void)
{
int cpu;
int err;
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 787c52ca49c..a32b706c401 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -1,31 +1,124 @@
/* Common code for 32 and 64-bit NUMA */
-#include <linux/topology.h>
-#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/init.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/mmzone.h>
+#include <linux/ctype.h>
+#include <linux/module.h>
+#include <linux/nodemask.h>
+#include <linux/sched.h>
+#include <linux/topology.h>
+
+#include <asm/e820.h>
+#include <asm/proto.h>
+#include <asm/dma.h>
+#include <asm/acpi.h>
+#include <asm/amd_nb.h>
+
+#include "numa_internal.h"
+
+int __initdata numa_off;
+nodemask_t numa_nodes_parsed __initdata;
+
+struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
+EXPORT_SYMBOL(node_data);
+
+static struct numa_meminfo numa_meminfo
+#ifndef CONFIG_MEMORY_HOTPLUG
+__initdata
+#endif
+;
+
+static int numa_distance_cnt;
+static u8 *numa_distance;
+
+static __init int numa_setup(char *opt)
+{
+ if (!opt)
+ return -EINVAL;
+ if (!strncmp(opt, "off", 3))
+ numa_off = 1;
+#ifdef CONFIG_NUMA_EMU
+ if (!strncmp(opt, "fake=", 5))
+ numa_emu_cmdline(opt + 5);
+#endif
+#ifdef CONFIG_ACPI_NUMA
+ if (!strncmp(opt, "noacpi", 6))
+ acpi_numa = -1;
+#endif
+ return 0;
+}
+early_param("numa", numa_setup);
/*
- * Which logical CPUs are on which nodes
+ * apicid, cpu, node mappings
*/
+s16 __apicid_to_node[MAX_LOCAL_APIC] = {
+ [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+};
+
+int numa_cpu_node(int cpu)
+{
+ int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
+
+ if (apicid != BAD_APICID)
+ return __apicid_to_node[apicid];
+ return NUMA_NO_NODE;
+}
+
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);
/*
+ * Map cpu index to node index
+ */
+DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
+
+void numa_set_node(int cpu, int node)
+{
+ int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
+
+ /* early setting, no percpu area yet */
+ if (cpu_to_node_map) {
+ cpu_to_node_map[cpu] = node;
+ return;
+ }
+
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+ if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
+ printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
+ dump_stack();
+ return;
+ }
+#endif
+ per_cpu(x86_cpu_to_node_map, cpu) = node;
+
+ set_cpu_numa_node(cpu, node);
+}
+
+void numa_clear_node(int cpu)
+{
+ numa_set_node(cpu, NUMA_NO_NODE);
+}
+
+/*
* Allocate node_to_cpumask_map based on number of available nodes
* Requires node_possible_map to be valid.
*
- * Note: node_to_cpumask() is not valid until after this is done.
+ * Note: cpumask_of_node() is not valid until after this is done.
* (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
*/
void __init setup_node_to_cpumask_map(void)
{
- unsigned int node, num = 0;
+ unsigned int node;
/* setup nr_node_ids if not done yet */
- if (nr_node_ids == MAX_NUMNODES) {
- for_each_node_mask(node, node_possible_map)
- num = node;
- nr_node_ids = num + 1;
- }
+ if (nr_node_ids == MAX_NUMNODES)
+ setup_nr_node_ids();
/* allocate the map */
for (node = 0; node < nr_node_ids; node++)
@@ -35,7 +128,719 @@ void __init setup_node_to_cpumask_map(void)
pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
}
-#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
+ struct numa_meminfo *mi)
+{
+ /* ignore zero length blks */
+ if (start == end)
+ return 0;
+
+ /* whine about and ignore invalid blks */
+ if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
+ pr_warning("NUMA: Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
+ nid, start, end - 1);
+ return 0;
+ }
+
+ if (mi->nr_blks >= NR_NODE_MEMBLKS) {
+ pr_err("NUMA: too many memblk ranges\n");
+ return -EINVAL;
+ }
+
+ mi->blk[mi->nr_blks].start = start;
+ mi->blk[mi->nr_blks].end = end;
+ mi->blk[mi->nr_blks].nid = nid;
+ mi->nr_blks++;
+ return 0;
+}
+
+/**
+ * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
+ * @idx: Index of memblk to remove
+ * @mi: numa_meminfo to remove memblk from
+ *
+ * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
+ * decrementing @mi->nr_blks.
+ */
+void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
+{
+ mi->nr_blks--;
+ memmove(&mi->blk[idx], &mi->blk[idx + 1],
+ (mi->nr_blks - idx) * sizeof(mi->blk[0]));
+}
+
+/**
+ * numa_add_memblk - Add one numa_memblk to numa_meminfo
+ * @nid: NUMA node ID of the new memblk
+ * @start: Start address of the new memblk
+ * @end: End address of the new memblk
+ *
+ * Add a new memblk to the default numa_meminfo.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init numa_add_memblk(int nid, u64 start, u64 end)
+{
+ return numa_add_memblk_to(nid, start, end, &numa_meminfo);
+}
+
+/* Initialize NODE_DATA for a node on the local memory */
+static void __init setup_node_data(int nid, u64 start, u64 end)
+{
+ const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
+ u64 nd_pa;
+ void *nd;
+ int tnid;
+
+ /*
+ * Don't confuse VM with a node that doesn't have the
+ * minimum amount of memory:
+ */
+ if (end && (end - start) < NODE_MIN_SIZE)
+ return;
+
+ start = roundup(start, ZONE_ALIGN);
+
+ printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n",
+ nid, start, end - 1);
+
+ /*
+ * Allocate node data. Try node-local memory and then any node.
+ * Never allocate in DMA zone.
+ */
+ nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
+ if (!nd_pa) {
+ nd_pa = __memblock_alloc_base(nd_size, SMP_CACHE_BYTES,
+ MEMBLOCK_ALLOC_ACCESSIBLE);
+ if (!nd_pa) {
+ pr_err("Cannot find %zu bytes in node %d\n",
+ nd_size, nid);
+ return;
+ }
+ }
+ nd = __va(nd_pa);
+
+ /* report and initialize */
+ printk(KERN_INFO " NODE_DATA [mem %#010Lx-%#010Lx]\n",
+ nd_pa, nd_pa + nd_size - 1);
+ tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
+ if (tnid != nid)
+ printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid);
+
+ node_data[nid] = nd;
+ memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
+ NODE_DATA(nid)->node_id = nid;
+ NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT;
+ NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT;
+
+ node_set_online(nid);
+}
+
+/**
+ * numa_cleanup_meminfo - Cleanup a numa_meminfo
+ * @mi: numa_meminfo to clean up
+ *
+ * Sanitize @mi by merging and removing unncessary memblks. Also check for
+ * conflicts and clear unused memblks.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
+{
+ const u64 low = 0;
+ const u64 high = PFN_PHYS(max_pfn);
+ int i, j, k;
+
+ /* first, trim all entries */
+ for (i = 0; i < mi->nr_blks; i++) {
+ struct numa_memblk *bi = &mi->blk[i];
+
+ /* make sure all blocks are inside the limits */
+ bi->start = max(bi->start, low);
+ bi->end = min(bi->end, high);
+
+ /* and there's no empty block */
+ if (bi->start >= bi->end)
+ numa_remove_memblk_from(i--, mi);
+ }
+
+ /* merge neighboring / overlapping entries */
+ for (i = 0; i < mi->nr_blks; i++) {
+ struct numa_memblk *bi = &mi->blk[i];
+
+ for (j = i + 1; j < mi->nr_blks; j++) {
+ struct numa_memblk *bj = &mi->blk[j];
+ u64 start, end;
+
+ /*
+ * See whether there are overlapping blocks. Whine
+ * about but allow overlaps of the same nid. They
+ * will be merged below.
+ */
+ if (bi->end > bj->start && bi->start < bj->end) {
+ if (bi->nid != bj->nid) {
+ pr_err("NUMA: node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
+ bi->nid, bi->start, bi->end - 1,
+ bj->nid, bj->start, bj->end - 1);
+ return -EINVAL;
+ }
+ pr_warning("NUMA: Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
+ bi->nid, bi->start, bi->end - 1,
+ bj->start, bj->end - 1);
+ }
+
+ /*
+ * Join together blocks on the same node, holes
+ * between which don't overlap with memory on other
+ * nodes.
+ */
+ if (bi->nid != bj->nid)
+ continue;
+ start = min(bi->start, bj->start);
+ end = max(bi->end, bj->end);
+ for (k = 0; k < mi->nr_blks; k++) {
+ struct numa_memblk *bk = &mi->blk[k];
+
+ if (bi->nid == bk->nid)
+ continue;
+ if (start < bk->end && end > bk->start)
+ break;
+ }
+ if (k < mi->nr_blks)
+ continue;
+ printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n",
+ bi->nid, bi->start, bi->end - 1, bj->start,
+ bj->end - 1, start, end - 1);
+ bi->start = start;
+ bi->end = end;
+ numa_remove_memblk_from(j--, mi);
+ }
+ }
+
+ /* clear unused ones */
+ for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
+ mi->blk[i].start = mi->blk[i].end = 0;
+ mi->blk[i].nid = NUMA_NO_NODE;
+ }
+
+ return 0;
+}
+
+/*
+ * Set nodes, which have memory in @mi, in *@nodemask.
+ */
+static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
+ const struct numa_meminfo *mi)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
+ if (mi->blk[i].start != mi->blk[i].end &&
+ mi->blk[i].nid != NUMA_NO_NODE)
+ node_set(mi->blk[i].nid, *nodemask);
+}
+
+/**
+ * numa_reset_distance - Reset NUMA distance table
+ *
+ * The current table is freed. The next numa_set_distance() call will
+ * create a new one.
+ */
+void __init numa_reset_distance(void)
+{
+ size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
+
+ /* numa_distance could be 1LU marking allocation failure, test cnt */
+ if (numa_distance_cnt)
+ memblock_free(__pa(numa_distance), size);
+ numa_distance_cnt = 0;
+ numa_distance = NULL; /* enable table creation */
+}
+
+static int __init numa_alloc_distance(void)
+{
+ nodemask_t nodes_parsed;
+ size_t size;
+ int i, j, cnt = 0;
+ u64 phys;
+
+ /* size the new table and allocate it */
+ nodes_parsed = numa_nodes_parsed;
+ numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
+
+ for_each_node_mask(i, nodes_parsed)
+ cnt = i;
+ cnt++;
+ size = cnt * cnt * sizeof(numa_distance[0]);
+
+ phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
+ size, PAGE_SIZE);
+ if (!phys) {
+ pr_warning("NUMA: Warning: can't allocate distance table!\n");
+ /* don't retry until explicitly reset */
+ numa_distance = (void *)1LU;
+ return -ENOMEM;
+ }
+ memblock_reserve(phys, size);
+
+ numa_distance = __va(phys);
+ numa_distance_cnt = cnt;
+
+ /* fill with the default distances */
+ for (i = 0; i < cnt; i++)
+ for (j = 0; j < cnt; j++)
+ numa_distance[i * cnt + j] = i == j ?
+ LOCAL_DISTANCE : REMOTE_DISTANCE;
+ printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
+
+ return 0;
+}
+
+/**
+ * numa_set_distance - Set NUMA distance from one NUMA to another
+ * @from: the 'from' node to set distance
+ * @to: the 'to' node to set distance
+ * @distance: NUMA distance
+ *
+ * Set the distance from node @from to @to to @distance. If distance table
+ * doesn't exist, one which is large enough to accommodate all the currently
+ * known nodes will be created.
+ *
+ * If such table cannot be allocated, a warning is printed and further
+ * calls are ignored until the distance table is reset with
+ * numa_reset_distance().
+ *
+ * If @from or @to is higher than the highest known node or lower than zero
+ * at the time of table creation or @distance doesn't make sense, the call
+ * is ignored.
+ * This is to allow simplification of specific NUMA config implementations.
+ */
+void __init numa_set_distance(int from, int to, int distance)
+{
+ if (!numa_distance && numa_alloc_distance() < 0)
+ return;
+
+ if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
+ from < 0 || to < 0) {
+ pr_warn_once("NUMA: Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
+ from, to, distance);
+ return;
+ }
+
+ if ((u8)distance != distance ||
+ (from == to && distance != LOCAL_DISTANCE)) {
+ pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
+ from, to, distance);
+ return;
+ }
+
+ numa_distance[from * numa_distance_cnt + to] = distance;
+}
+
+int __node_distance(int from, int to)
+{
+ if (from >= numa_distance_cnt || to >= numa_distance_cnt)
+ return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
+ return numa_distance[from * numa_distance_cnt + to];
+}
+EXPORT_SYMBOL(__node_distance);
+
+/*
+ * Sanity check to catch more bad NUMA configurations (they are amazingly
+ * common). Make sure the nodes cover all memory.
+ */
+static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
+{
+ u64 numaram, e820ram;
+ int i;
+
+ numaram = 0;
+ for (i = 0; i < mi->nr_blks; i++) {
+ u64 s = mi->blk[i].start >> PAGE_SHIFT;
+ u64 e = mi->blk[i].end >> PAGE_SHIFT;
+ numaram += e - s;
+ numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
+ if ((s64)numaram < 0)
+ numaram = 0;
+ }
+
+ e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
+
+ /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
+ if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
+ printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n",
+ (numaram << PAGE_SHIFT) >> 20,
+ (e820ram << PAGE_SHIFT) >> 20);
+ return false;
+ }
+ return true;
+}
+
+static int __init numa_register_memblks(struct numa_meminfo *mi)
+{
+ unsigned long uninitialized_var(pfn_align);
+ int i, nid;
+
+ /* Account for nodes with cpus and no memory */
+ node_possible_map = numa_nodes_parsed;
+ numa_nodemask_from_meminfo(&node_possible_map, mi);
+ if (WARN_ON(nodes_empty(node_possible_map)))
+ return -EINVAL;
+
+ for (i = 0; i < mi->nr_blks; i++) {
+ struct numa_memblk *mb = &mi->blk[i];
+ memblock_set_node(mb->start, mb->end - mb->start,
+ &memblock.memory, mb->nid);
+ }
+
+ /*
+ * If sections array is gonna be used for pfn -> nid mapping, check
+ * whether its granularity is fine enough.
+ */
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+ pfn_align = node_map_pfn_alignment();
+ if (pfn_align && pfn_align < PAGES_PER_SECTION) {
+ printk(KERN_WARNING "Node alignment %LuMB < min %LuMB, rejecting NUMA config\n",
+ PFN_PHYS(pfn_align) >> 20,
+ PFN_PHYS(PAGES_PER_SECTION) >> 20);
+ return -EINVAL;
+ }
+#endif
+ if (!numa_meminfo_cover_memory(mi))
+ return -EINVAL;
+
+ /* Finally register nodes. */
+ for_each_node_mask(nid, node_possible_map) {
+ u64 start = PFN_PHYS(max_pfn);
+ u64 end = 0;
+
+ for (i = 0; i < mi->nr_blks; i++) {
+ if (nid != mi->blk[i].nid)
+ continue;
+ start = min(mi->blk[i].start, start);
+ end = max(mi->blk[i].end, end);
+ }
+
+ if (start < end)
+ setup_node_data(nid, start, end);
+ }
+
+ /* Dump memblock with node info and return. */
+ memblock_dump_all();
+ return 0;
+}
+
+/*
+ * There are unfortunately some poorly designed mainboards around that
+ * only connect memory to a single CPU. This breaks the 1:1 cpu->node
+ * mapping. To avoid this fill in the mapping for all possible CPUs,
+ * as the number of CPUs is not known yet. We round robin the existing
+ * nodes.
+ */
+static void __init numa_init_array(void)
+{
+ int rr, i;
+
+ rr = first_node(node_online_map);
+ for (i = 0; i < nr_cpu_ids; i++) {
+ if (early_cpu_to_node(i) != NUMA_NO_NODE)
+ continue;
+ numa_set_node(i, rr);
+ rr = next_node(rr, node_online_map);
+ if (rr == MAX_NUMNODES)
+ rr = first_node(node_online_map);
+ }
+}
+
+static void __init numa_clear_kernel_node_hotplug(void)
+{
+ int i, nid;
+ nodemask_t numa_kernel_nodes = NODE_MASK_NONE;
+ unsigned long start, end;
+ struct memblock_region *r;
+
+ /*
+ * At this time, all memory regions reserved by memblock are
+ * used by the kernel. Set the nid in memblock.reserved will
+ * mark out all the nodes the kernel resides in.
+ */
+ for (i = 0; i < numa_meminfo.nr_blks; i++) {
+ struct numa_memblk *mb = &numa_meminfo.blk[i];
+ memblock_set_node(mb->start, mb->end - mb->start,
+ &memblock.reserved, mb->nid);
+ }
+
+ /* Mark all kernel nodes. */
+ for_each_memblock(reserved, r)
+ node_set(r->nid, numa_kernel_nodes);
+
+ /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */
+ for (i = 0; i < numa_meminfo.nr_blks; i++) {
+ nid = numa_meminfo.blk[i].nid;
+ if (!node_isset(nid, numa_kernel_nodes))
+ continue;
+
+ start = numa_meminfo.blk[i].start;
+ end = numa_meminfo.blk[i].end;
+
+ memblock_clear_hotplug(start, end - start);
+ }
+}
+
+static int __init numa_init(int (*init_func)(void))
+{
+ int i;
+ int ret;
+
+ for (i = 0; i < MAX_LOCAL_APIC; i++)
+ set_apicid_to_node(i, NUMA_NO_NODE);
+
+ nodes_clear(numa_nodes_parsed);
+ nodes_clear(node_possible_map);
+ nodes_clear(node_online_map);
+ memset(&numa_meminfo, 0, sizeof(numa_meminfo));
+ WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory,
+ MAX_NUMNODES));
+ WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved,
+ MAX_NUMNODES));
+ /* In case that parsing SRAT failed. */
+ WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX));
+ numa_reset_distance();
+
+ ret = init_func();
+ if (ret < 0)
+ return ret;
+
+ /*
+ * We reset memblock back to the top-down direction
+ * here because if we configured ACPI_NUMA, we have
+ * parsed SRAT in init_func(). It is ok to have the
+ * reset here even if we did't configure ACPI_NUMA
+ * or acpi numa init fails and fallbacks to dummy
+ * numa init.
+ */
+ memblock_set_bottom_up(false);
+
+ ret = numa_cleanup_meminfo(&numa_meminfo);
+ if (ret < 0)
+ return ret;
+
+ numa_emulation(&numa_meminfo, numa_distance_cnt);
+
+ ret = numa_register_memblks(&numa_meminfo);
+ if (ret < 0)
+ return ret;
+
+ for (i = 0; i < nr_cpu_ids; i++) {
+ int nid = early_cpu_to_node(i);
+
+ if (nid == NUMA_NO_NODE)
+ continue;
+ if (!node_online(nid))
+ numa_clear_node(i);
+ }
+ numa_init_array();
+
+ /*
+ * At very early time, the kernel have to use some memory such as
+ * loading the kernel image. We cannot prevent this anyway. So any
+ * node the kernel resides in should be un-hotpluggable.
+ *
+ * And when we come here, numa_init() won't fail.
+ */
+ numa_clear_kernel_node_hotplug();
+
+ return 0;
+}
+
+/**
+ * dummy_numa_init - Fallback dummy NUMA init
+ *
+ * Used if there's no underlying NUMA architecture, NUMA initialization
+ * fails, or NUMA is disabled on the command line.
+ *
+ * Must online at least one node and add memory blocks that cover all
+ * allowed memory. This function must not fail.
+ */
+static int __init dummy_numa_init(void)
+{
+ printk(KERN_INFO "%s\n",
+ numa_off ? "NUMA turned off" : "No NUMA configuration found");
+ printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
+ 0LLU, PFN_PHYS(max_pfn) - 1);
+
+ node_set(0, numa_nodes_parsed);
+ numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
+
+ return 0;
+}
+
+/**
+ * x86_numa_init - Initialize NUMA
+ *
+ * Try each configured NUMA initialization method until one succeeds. The
+ * last fallback is dummy single node config encomapssing whole memory and
+ * never fails.
+ */
+void __init x86_numa_init(void)
+{
+ if (!numa_off) {
+#ifdef CONFIG_ACPI_NUMA
+ if (!numa_init(x86_acpi_numa_init))
+ return;
+#endif
+#ifdef CONFIG_AMD_NUMA
+ if (!numa_init(amd_numa_init))
+ return;
+#endif
+ }
+
+ numa_init(dummy_numa_init);
+}
+
+static __init int find_near_online_node(int node)
+{
+ int n, val;
+ int min_val = INT_MAX;
+ int best_node = -1;
+
+ for_each_online_node(n) {
+ val = node_distance(node, n);
+
+ if (val < min_val) {
+ min_val = val;
+ best_node = n;
+ }
+ }
+
+ return best_node;
+}
+
+/*
+ * Setup early cpu_to_node.
+ *
+ * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
+ * and apicid_to_node[] tables have valid entries for a CPU.
+ * This means we skip cpu_to_node[] initialisation for NUMA
+ * emulation and faking node case (when running a kernel compiled
+ * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
+ * is already initialized in a round robin manner at numa_init_array,
+ * prior to this call, and this initialization is good enough
+ * for the fake NUMA cases.
+ *
+ * Called before the per_cpu areas are setup.
+ */
+void __init init_cpu_to_node(void)
+{
+ int cpu;
+ u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
+
+ BUG_ON(cpu_to_apicid == NULL);
+
+ for_each_possible_cpu(cpu) {
+ int node = numa_cpu_node(cpu);
+
+ if (node == NUMA_NO_NODE)
+ continue;
+ if (!node_online(node))
+ node = find_near_online_node(node);
+ numa_set_node(cpu, node);
+ }
+}
+
+#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+
+# ifndef CONFIG_NUMA_EMU
+void numa_add_cpu(int cpu)
+{
+ cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
+
+void numa_remove_cpu(int cpu)
+{
+ cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
+# endif /* !CONFIG_NUMA_EMU */
+
+#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
+
+int __cpu_to_node(int cpu)
+{
+ if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
+ printk(KERN_WARNING
+ "cpu_to_node(%d): usage too early!\n", cpu);
+ dump_stack();
+ return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+ }
+ return per_cpu(x86_cpu_to_node_map, cpu);
+}
+EXPORT_SYMBOL(__cpu_to_node);
+
+/*
+ * Same function as cpu_to_node() but used if called before the
+ * per_cpu areas are setup.
+ */
+int early_cpu_to_node(int cpu)
+{
+ if (early_per_cpu_ptr(x86_cpu_to_node_map))
+ return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+
+ if (!cpu_possible(cpu)) {
+ printk(KERN_WARNING
+ "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
+ dump_stack();
+ return NUMA_NO_NODE;
+ }
+ return per_cpu(x86_cpu_to_node_map, cpu);
+}
+
+void debug_cpumask_set_cpu(int cpu, int node, bool enable)
+{
+ struct cpumask *mask;
+ char buf[64];
+
+ if (node == NUMA_NO_NODE) {
+ /* early_cpu_to_node() already emits a warning and trace */
+ return;
+ }
+ mask = node_to_cpumask_map[node];
+ if (!mask) {
+ pr_err("node_to_cpumask_map[%i] NULL\n", node);
+ dump_stack();
+ return;
+ }
+
+ if (enable)
+ cpumask_set_cpu(cpu, mask);
+ else
+ cpumask_clear_cpu(cpu, mask);
+
+ cpulist_scnprintf(buf, sizeof(buf), mask);
+ printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
+ enable ? "numa_add_cpu" : "numa_remove_cpu",
+ cpu, node, buf);
+ return;
+}
+
+# ifndef CONFIG_NUMA_EMU
+static void numa_set_cpumask(int cpu, bool enable)
+{
+ debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
+}
+
+void numa_add_cpu(int cpu)
+{
+ numa_set_cpumask(cpu, true);
+}
+
+void numa_remove_cpu(int cpu)
+{
+ numa_set_cpumask(cpu, false);
+}
+# endif /* !CONFIG_NUMA_EMU */
+
/*
* Returns a pointer to the bitmask of CPUs on Node 'node'.
*/
@@ -58,4 +863,20 @@ const struct cpumask *cpumask_of_node(int node)
return node_to_cpumask_map[node];
}
EXPORT_SYMBOL(cpumask_of_node);
+
+#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int memory_add_physaddr_to_nid(u64 start)
+{
+ struct numa_meminfo *mi = &numa_meminfo;
+ int nid = mi->blk[0].nid;
+ int i;
+
+ for (i = 0; i < mi->nr_blks; i++)
+ if (mi->blk[i].start <= start && mi->blk[i].end > start)
+ nid = mi->blk[i].nid;
+ return nid;
+}
+EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 84a3e4c9f27..47b6436e41c 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -22,39 +22,11 @@
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
-#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
-#include <linux/mmzone.h>
-#include <linux/highmem.h>
-#include <linux/initrd.h>
-#include <linux/nodemask.h>
#include <linux/module.h>
-#include <linux/kexec.h>
-#include <linux/pfn.h>
-#include <linux/swap.h>
-#include <linux/acpi.h>
-
-#include <asm/e820.h>
-#include <asm/setup.h>
-#include <asm/mmzone.h>
-#include <asm/bios_ebda.h>
-#include <asm/proto.h>
-
-struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
-EXPORT_SYMBOL(node_data);
-
-/*
- * numa interface - we expect the numa architecture specific code to have
- * populated the following initialisation.
- *
- * 1) node_online_map - the map of all nodes configured (online) in the system
- * 2) node_start_pfn - the starting page frame number for a node
- * 3) node_end_pfn - the ending page fram number for a node
- */
-unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly;
-unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
+#include "numa_internal.h"
#ifdef CONFIG_DISCONTIGMEM
/*
@@ -69,7 +41,7 @@ unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
* physnode_map[16-31] = 1;
* physnode_map[32- ] = -1;
*/
-s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
+s8 physnode_map[MAX_SECTIONS] __read_mostly = { [0 ... (MAX_SECTIONS - 1)] = -1};
EXPORT_SYMBOL(physnode_map);
void memory_present(int nid, unsigned long start, unsigned long end)
@@ -80,8 +52,10 @@ void memory_present(int nid, unsigned long start, unsigned long end)
nid, start, end);
printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid);
printk(KERN_DEBUG " ");
- for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
- physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
+ start = round_down(start, PAGES_PER_SECTION);
+ end = round_up(end, PAGES_PER_SECTION);
+ for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
+ physnode_map[pfn / PAGES_PER_SECTION] = nid;
printk(KERN_CONT "%lx ", pfn);
}
printk(KERN_CONT "\n");
@@ -99,301 +73,20 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
}
#endif
-extern unsigned long find_max_low_pfn(void);
extern unsigned long highend_pfn, highstart_pfn;
-#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
-
-unsigned long node_remap_size[MAX_NUMNODES];
-static void *node_remap_start_vaddr[MAX_NUMNODES];
-void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
-
-static unsigned long kva_start_pfn;
-static unsigned long kva_pages;
-/*
- * FLAT - support for basic PC memory model with discontig enabled, essentially
- * a single node with all available processors in it with a flat
- * memory map.
- */
-int __init get_memcfg_numa_flat(void)
-{
- printk(KERN_DEBUG "NUMA - single node, flat memory mode\n");
-
- node_start_pfn[0] = 0;
- node_end_pfn[0] = max_pfn;
- memblock_x86_register_active_regions(0, 0, max_pfn);
- memory_present(0, 0, max_pfn);
- node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
-
- /* Indicate there is one node available. */
- nodes_clear(node_online_map);
- node_set_online(0);
- return 1;
-}
-
-/*
- * Find the highest page frame number we have available for the node
- */
-static void __init propagate_e820_map_node(int nid)
-{
- if (node_end_pfn[nid] > max_pfn)
- node_end_pfn[nid] = max_pfn;
- /*
- * if a user has given mem=XXXX, then we need to make sure
- * that the node _starts_ before that, too, not just ends
- */
- if (node_start_pfn[nid] > max_pfn)
- node_start_pfn[nid] = max_pfn;
- BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]);
-}
-
-/*
- * Allocate memory for the pg_data_t for this node via a crude pre-bootmem
- * method. For node zero take this from the bottom of memory, for
- * subsequent nodes place them at node_remap_start_vaddr which contains
- * node local data in physically node local memory. See setup_memory()
- * for details.
- */
-static void __init allocate_pgdat(int nid)
-{
- char buf[16];
-
- if (node_has_online_mem(nid) && node_remap_start_vaddr[nid])
- NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
- else {
- unsigned long pgdat_phys;
- pgdat_phys = memblock_find_in_range(min_low_pfn<<PAGE_SHIFT,
- max_pfn_mapped<<PAGE_SHIFT,
- sizeof(pg_data_t),
- PAGE_SIZE);
- NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
- memset(buf, 0, sizeof(buf));
- sprintf(buf, "NODE_DATA %d", nid);
- memblock_x86_reserve_range(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf);
- }
- printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n",
- nid, (unsigned long)NODE_DATA(nid));
-}
-
-/*
- * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel
- * virtual address space (KVA) is reserved and portions of nodes are mapped
- * using it. This is to allow node-local memory to be allocated for
- * structures that would normally require ZONE_NORMAL. The memory is
- * allocated with alloc_remap() and callers should be prepared to allocate
- * from the bootmem allocator instead.
- */
-static unsigned long node_remap_start_pfn[MAX_NUMNODES];
-static void *node_remap_end_vaddr[MAX_NUMNODES];
-static void *node_remap_alloc_vaddr[MAX_NUMNODES];
-static unsigned long node_remap_offset[MAX_NUMNODES];
-
-void *alloc_remap(int nid, unsigned long size)
-{
- void *allocation = node_remap_alloc_vaddr[nid];
-
- size = ALIGN(size, L1_CACHE_BYTES);
-
- if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid])
- return NULL;
-
- node_remap_alloc_vaddr[nid] += size;
- memset(allocation, 0, size);
-
- return allocation;
-}
-
-static void __init remap_numa_kva(void)
-{
- void *vaddr;
- unsigned long pfn;
- int node;
-
- for_each_online_node(node) {
- printk(KERN_DEBUG "remap_numa_kva: node %d\n", node);
- for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
- vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
- printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n",
- (unsigned long)vaddr,
- node_remap_start_pfn[node] + pfn);
- set_pmd_pfn((ulong) vaddr,
- node_remap_start_pfn[node] + pfn,
- PAGE_KERNEL_LARGE);
- }
- }
-}
-
-#ifdef CONFIG_HIBERNATION
-/**
- * resume_map_numa_kva - add KVA mapping to the temporary page tables created
- * during resume from hibernation
- * @pgd_base - temporary resume page directory
- */
-void resume_map_numa_kva(pgd_t *pgd_base)
-{
- int node;
-
- for_each_online_node(node) {
- unsigned long start_va, start_pfn, size, pfn;
-
- start_va = (unsigned long)node_remap_start_vaddr[node];
- start_pfn = node_remap_start_pfn[node];
- size = node_remap_size[node];
-
- printk(KERN_DEBUG "%s: node %d\n", __func__, node);
-
- for (pfn = 0; pfn < size; pfn += PTRS_PER_PTE) {
- unsigned long vaddr = start_va + (pfn << PAGE_SHIFT);
- pgd_t *pgd = pgd_base + pgd_index(vaddr);
- pud_t *pud = pud_offset(pgd, vaddr);
- pmd_t *pmd = pmd_offset(pud, vaddr);
-
- set_pmd(pmd, pfn_pmd(start_pfn + pfn,
- PAGE_KERNEL_LARGE_EXEC));
-
- printk(KERN_DEBUG "%s: %08lx -> pfn %08lx\n",
- __func__, vaddr, start_pfn + pfn);
- }
- }
-}
-#endif
-
-static __init unsigned long calculate_numa_remap_pages(void)
-{
- int nid;
- unsigned long size, reserve_pages = 0;
-
- for_each_online_node(nid) {
- u64 node_kva_target;
- u64 node_kva_final;
-
- /*
- * The acpi/srat node info can show hot-add memroy zones
- * where memory could be added but not currently present.
- */
- printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
- nid, node_start_pfn[nid], node_end_pfn[nid]);
- if (node_start_pfn[nid] > max_pfn)
- continue;
- if (!node_end_pfn[nid])
- continue;
- if (node_end_pfn[nid] > max_pfn)
- node_end_pfn[nid] = max_pfn;
-
- /* ensure the remap includes space for the pgdat. */
- size = node_remap_size[nid] + sizeof(pg_data_t);
-
- /* convert size to large (pmd size) pages, rounding up */
- size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
- /* now the roundup is correct, convert to PAGE_SIZE pages */
- size = size * PTRS_PER_PTE;
-
- node_kva_target = round_down(node_end_pfn[nid] - size,
- PTRS_PER_PTE);
- node_kva_target <<= PAGE_SHIFT;
- do {
- node_kva_final = memblock_find_in_range(node_kva_target,
- ((u64)node_end_pfn[nid])<<PAGE_SHIFT,
- ((u64)size)<<PAGE_SHIFT,
- LARGE_PAGE_BYTES);
- node_kva_target -= LARGE_PAGE_BYTES;
- } while (node_kva_final == MEMBLOCK_ERROR &&
- (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
-
- if (node_kva_final == MEMBLOCK_ERROR)
- panic("Can not get kva ram\n");
-
- node_remap_size[nid] = size;
- node_remap_offset[nid] = reserve_pages;
- reserve_pages += size;
- printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of"
- " node %d at %llx\n",
- size, nid, node_kva_final>>PAGE_SHIFT);
-
- /*
- * prevent kva address below max_low_pfn want it on system
- * with less memory later.
- * layout will be: KVA address , KVA RAM
- *
- * we are supposed to only record the one less then max_low_pfn
- * but we could have some hole in high memory, and it will only
- * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide
- * to use it as free.
- * So memblock_x86_reserve_range here, hope we don't run out of that array
- */
- memblock_x86_reserve_range(node_kva_final,
- node_kva_final+(((u64)size)<<PAGE_SHIFT),
- "KVA RAM");
-
- node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
- }
- printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n",
- reserve_pages);
- return reserve_pages;
-}
-
-static void init_remap_allocator(int nid)
-{
- node_remap_start_vaddr[nid] = pfn_to_kaddr(
- kva_start_pfn + node_remap_offset[nid]);
- node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
- (node_remap_size[nid] * PAGE_SIZE);
- node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
- ALIGN(sizeof(pg_data_t), PAGE_SIZE);
-
- printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid,
- (ulong) node_remap_start_vaddr[nid],
- (ulong) node_remap_end_vaddr[nid]);
-}
-
-void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
- int acpi, int k8)
+void __init initmem_init(void)
{
- int nid;
- long kva_target_pfn;
+ x86_numa_init();
- /*
- * When mapping a NUMA machine we allocate the node_mem_map arrays
- * from node local memory. They are then mapped directly into KVA
- * between zone normal and vmalloc space. Calculate the size of
- * this space and use it to adjust the boundary between ZONE_NORMAL
- * and ZONE_HIGHMEM.
- */
-
- get_memcfg_numa();
-
- kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);
-
- kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
- do {
- kva_start_pfn = memblock_find_in_range(kva_target_pfn<<PAGE_SHIFT,
- max_low_pfn<<PAGE_SHIFT,
- kva_pages<<PAGE_SHIFT,
- PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
- kva_target_pfn -= PTRS_PER_PTE;
- } while (kva_start_pfn == MEMBLOCK_ERROR && kva_target_pfn > min_low_pfn);
-
- if (kva_start_pfn == MEMBLOCK_ERROR)
- panic("Can not get kva space\n");
-
- printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n",
- kva_start_pfn, max_low_pfn);
- printk(KERN_INFO "max_pfn = %lx\n", max_pfn);
-
- /* avoid clash with initrd */
- memblock_x86_reserve_range(kva_start_pfn<<PAGE_SHIFT,
- (kva_start_pfn + kva_pages)<<PAGE_SHIFT,
- "KVA PG");
#ifdef CONFIG_HIGHMEM
highstart_pfn = highend_pfn = max_pfn;
if (max_pfn > max_low_pfn)
highstart_pfn = max_low_pfn;
printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
pages_to_mb(highend_pfn - highstart_pfn));
- num_physpages = highend_pfn;
high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
#else
- num_physpages = max_low_pfn;
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
#endif
printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
@@ -403,51 +96,9 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
(ulong) pfn_to_kaddr(max_low_pfn));
- for_each_online_node(nid) {
- init_remap_allocator(nid);
-
- allocate_pgdat(nid);
- }
- remap_numa_kva();
printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
(ulong) pfn_to_kaddr(highstart_pfn));
- for_each_online_node(nid)
- propagate_e820_map_node(nid);
-
- for_each_online_node(nid) {
- memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
- NODE_DATA(nid)->node_id = nid;
- }
setup_bootmem_allocator();
}
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-static int paddr_to_nid(u64 addr)
-{
- int nid;
- unsigned long pfn = PFN_DOWN(addr);
-
- for_each_node(nid)
- if (node_start_pfn[nid] <= pfn &&
- pfn < node_end_pfn[nid])
- return nid;
-
- return -1;
-}
-
-/*
- * This function is used to ask node id BEFORE memmap and mem_section's
- * initialization (pfn_to_nid() can't be used yet).
- * If _PXM is not defined on ACPI's DSDT, node id must be found by this.
- */
-int memory_add_physaddr_to_nid(u64 addr)
-{
- int nid = paddr_to_nid(addr);
- return (nid >= 0) ? nid : 0;
-}
-
-EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
-#endif
-
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 7ffc9b727ef..9405ffc9150 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -2,851 +2,11 @@
* Generic VM initialization for x86-64 NUMA setups.
* Copyright 2002,2003 Andi Kleen, SuSE Labs.
*/
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/init.h>
#include <linux/bootmem.h>
-#include <linux/memblock.h>
-#include <linux/mmzone.h>
-#include <linux/ctype.h>
-#include <linux/module.h>
-#include <linux/nodemask.h>
-#include <linux/sched.h>
-#include <asm/e820.h>
-#include <asm/proto.h>
-#include <asm/dma.h>
-#include <asm/numa.h>
-#include <asm/acpi.h>
-#include <asm/amd_nb.h>
+#include "numa_internal.h"
-struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
-EXPORT_SYMBOL(node_data);
-
-struct memnode memnode;
-
-s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
- [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
-};
-
-int numa_off __initdata;
-static unsigned long __initdata nodemap_addr;
-static unsigned long __initdata nodemap_size;
-
-/*
- * Map cpu index to node index
- */
-DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
-EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
-
-/*
- * Given a shift value, try to populate memnodemap[]
- * Returns :
- * 1 if OK
- * 0 if memnodmap[] too small (of shift too small)
- * -1 if node overlap or lost ram (shift too big)
- */
-static int __init populate_memnodemap(const struct bootnode *nodes,
- int numnodes, int shift, int *nodeids)
-{
- unsigned long addr, end;
- int i, res = -1;
-
- memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
- for (i = 0; i < numnodes; i++) {
- addr = nodes[i].start;
- end = nodes[i].end;
- if (addr >= end)
- continue;
- if ((end >> shift) >= memnodemapsize)
- return 0;
- do {
- if (memnodemap[addr >> shift] != NUMA_NO_NODE)
- return -1;
-
- if (!nodeids)
- memnodemap[addr >> shift] = i;
- else
- memnodemap[addr >> shift] = nodeids[i];
-
- addr += (1UL << shift);
- } while (addr < end);
- res = 1;
- }
- return res;
-}
-
-static int __init allocate_cachealigned_memnodemap(void)
-{
- unsigned long addr;
-
- memnodemap = memnode.embedded_map;
- if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
- return 0;
-
- addr = 0x8000;
- nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
- nodemap_addr = memblock_find_in_range(addr, max_pfn<<PAGE_SHIFT,
- nodemap_size, L1_CACHE_BYTES);
- if (nodemap_addr == MEMBLOCK_ERROR) {
- printk(KERN_ERR
- "NUMA: Unable to allocate Memory to Node hash map\n");
- nodemap_addr = nodemap_size = 0;
- return -1;
- }
- memnodemap = phys_to_virt(nodemap_addr);
- memblock_x86_reserve_range(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP");
-
- printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
- nodemap_addr, nodemap_addr + nodemap_size);
- return 0;
-}
-
-/*
- * The LSB of all start and end addresses in the node map is the value of the
- * maximum possible shift.
- */
-static int __init extract_lsb_from_nodes(const struct bootnode *nodes,
- int numnodes)
-{
- int i, nodes_used = 0;
- unsigned long start, end;
- unsigned long bitfield = 0, memtop = 0;
-
- for (i = 0; i < numnodes; i++) {
- start = nodes[i].start;
- end = nodes[i].end;
- if (start >= end)
- continue;
- bitfield |= start;
- nodes_used++;
- if (end > memtop)
- memtop = end;
- }
- if (nodes_used <= 1)
- i = 63;
- else
- i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
- memnodemapsize = (memtop >> i)+1;
- return i;
-}
-
-int __init compute_hash_shift(struct bootnode *nodes, int numnodes,
- int *nodeids)
-{
- int shift;
-
- shift = extract_lsb_from_nodes(nodes, numnodes);
- if (allocate_cachealigned_memnodemap())
- return -1;
- printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
- shift);
-
- if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) {
- printk(KERN_INFO "Your memory is not aligned you need to "
- "rebuild your kernel with a bigger NODEMAPSIZE "
- "shift=%d\n", shift);
- return -1;
- }
- return shift;
-}
-
-int __meminit __early_pfn_to_nid(unsigned long pfn)
-{
- return phys_to_nid(pfn << PAGE_SHIFT);
-}
-
-static void * __init early_node_mem(int nodeid, unsigned long start,
- unsigned long end, unsigned long size,
- unsigned long align)
-{
- unsigned long mem;
-
- /*
- * put it on high as possible
- * something will go with NODE_DATA
- */
- if (start < (MAX_DMA_PFN<<PAGE_SHIFT))
- start = MAX_DMA_PFN<<PAGE_SHIFT;
- if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) &&
- end > (MAX_DMA32_PFN<<PAGE_SHIFT))
- start = MAX_DMA32_PFN<<PAGE_SHIFT;
- mem = memblock_x86_find_in_range_node(nodeid, start, end, size, align);
- if (mem != MEMBLOCK_ERROR)
- return __va(mem);
-
- /* extend the search scope */
- end = max_pfn_mapped << PAGE_SHIFT;
- start = MAX_DMA_PFN << PAGE_SHIFT;
- mem = memblock_find_in_range(start, end, size, align);
- if (mem != MEMBLOCK_ERROR)
- return __va(mem);
-
- printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
- size, nodeid);
-
- return NULL;
-}
-
-/* Initialize bootmem allocator for a node */
-void __init
-setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
-{
- unsigned long start_pfn, last_pfn, nodedata_phys;
- const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
- int nid;
-
- if (!end)
- return;
-
- /*
- * Don't confuse VM with a node that doesn't have the
- * minimum amount of memory:
- */
- if (end && (end - start) < NODE_MIN_SIZE)
- return;
-
- start = roundup(start, ZONE_ALIGN);
-
- printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid,
- start, end);
-
- start_pfn = start >> PAGE_SHIFT;
- last_pfn = end >> PAGE_SHIFT;
-
- node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
- SMP_CACHE_BYTES);
- if (node_data[nodeid] == NULL)
- return;
- nodedata_phys = __pa(node_data[nodeid]);
- memblock_x86_reserve_range(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
- printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
- nodedata_phys + pgdat_size - 1);
- nid = phys_to_nid(nodedata_phys);
- if (nid != nodeid)
- printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
-
- memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
- NODE_DATA(nodeid)->node_id = nodeid;
- NODE_DATA(nodeid)->node_start_pfn = start_pfn;
- NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
-
- node_set_online(nodeid);
-}
-
-/*
- * There are unfortunately some poorly designed mainboards around that
- * only connect memory to a single CPU. This breaks the 1:1 cpu->node
- * mapping. To avoid this fill in the mapping for all possible CPUs,
- * as the number of CPUs is not known yet. We round robin the existing
- * nodes.
- */
-void __init numa_init_array(void)
-{
- int rr, i;
-
- rr = first_node(node_online_map);
- for (i = 0; i < nr_cpu_ids; i++) {
- if (early_cpu_to_node(i) != NUMA_NO_NODE)
- continue;
- numa_set_node(i, rr);
- rr = next_node(rr, node_online_map);
- if (rr == MAX_NUMNODES)
- rr = first_node(node_online_map);
- }
-}
-
-#ifdef CONFIG_NUMA_EMU
-/* Numa emulation */
-static struct bootnode nodes[MAX_NUMNODES] __initdata;
-static struct bootnode physnodes[MAX_NUMNODES] __initdata;
-static char *cmdline __initdata;
-
-static int __init setup_physnodes(unsigned long start, unsigned long end,
- int acpi, int k8)
-{
- int nr_nodes = 0;
- int ret = 0;
- int i;
-
-#ifdef CONFIG_ACPI_NUMA
- if (acpi)
- nr_nodes = acpi_get_nodes(physnodes);
-#endif
-#ifdef CONFIG_K8_NUMA
- if (k8)
- nr_nodes = k8_get_nodes(physnodes);
-#endif
- /*
- * Basic sanity checking on the physical node map: there may be errors
- * if the SRAT or K8 incorrectly reported the topology or the mem=
- * kernel parameter is used.
- */
- for (i = 0; i < nr_nodes; i++) {
- if (physnodes[i].start == physnodes[i].end)
- continue;
- if (physnodes[i].start > end) {
- physnodes[i].end = physnodes[i].start;
- continue;
- }
- if (physnodes[i].end < start) {
- physnodes[i].start = physnodes[i].end;
- continue;
- }
- if (physnodes[i].start < start)
- physnodes[i].start = start;
- if (physnodes[i].end > end)
- physnodes[i].end = end;
- }
-
- /*
- * Remove all nodes that have no memory or were truncated because of the
- * limited address range.
- */
- for (i = 0; i < nr_nodes; i++) {
- if (physnodes[i].start == physnodes[i].end)
- continue;
- physnodes[ret].start = physnodes[i].start;
- physnodes[ret].end = physnodes[i].end;
- ret++;
- }
-
- /*
- * If no physical topology was detected, a single node is faked to cover
- * the entire address space.
- */
- if (!ret) {
- physnodes[ret].start = start;
- physnodes[ret].end = end;
- ret = 1;
- }
- return ret;
-}
-
-/*
- * Setups up nid to range from addr to addr + size. If the end
- * boundary is greater than max_addr, then max_addr is used instead.
- * The return value is 0 if there is additional memory left for
- * allocation past addr and -1 otherwise. addr is adjusted to be at
- * the end of the node.
- */
-static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
+void __init initmem_init(void)
{
- int ret = 0;
- nodes[nid].start = *addr;
- *addr += size;
- if (*addr >= max_addr) {
- *addr = max_addr;
- ret = -1;
- }
- nodes[nid].end = *addr;
- node_set(nid, node_possible_map);
- printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
- nodes[nid].start, nodes[nid].end,
- (nodes[nid].end - nodes[nid].start) >> 20);
- return ret;
+ x86_numa_init();
}
-
-/*
- * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
- * to max_addr. The return value is the number of nodes allocated.
- */
-static int __init split_nodes_interleave(u64 addr, u64 max_addr,
- int nr_phys_nodes, int nr_nodes)
-{
- nodemask_t physnode_mask = NODE_MASK_NONE;
- u64 size;
- int big;
- int ret = 0;
- int i;
-
- if (nr_nodes <= 0)
- return -1;
- if (nr_nodes > MAX_NUMNODES) {
- pr_info("numa=fake=%d too large, reducing to %d\n",
- nr_nodes, MAX_NUMNODES);
- nr_nodes = MAX_NUMNODES;
- }
-
- size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes;
- /*
- * Calculate the number of big nodes that can be allocated as a result
- * of consolidating the remainder.
- */
- big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
- FAKE_NODE_MIN_SIZE;
-
- size &= FAKE_NODE_MIN_HASH_MASK;
- if (!size) {
- pr_err("Not enough memory for each node. "
- "NUMA emulation disabled.\n");
- return -1;
- }
-
- for (i = 0; i < nr_phys_nodes; i++)
- if (physnodes[i].start != physnodes[i].end)
- node_set(i, physnode_mask);
-
- /*
- * Continue to fill physical nodes with fake nodes until there is no
- * memory left on any of them.
- */
- while (nodes_weight(physnode_mask)) {
- for_each_node_mask(i, physnode_mask) {
- u64 end = physnodes[i].start + size;
- u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
-
- if (ret < big)
- end += FAKE_NODE_MIN_SIZE;
-
- /*
- * Continue to add memory to this fake node if its
- * non-reserved memory is less than the per-node size.
- */
- while (end - physnodes[i].start -
- memblock_x86_hole_size(physnodes[i].start, end) < size) {
- end += FAKE_NODE_MIN_SIZE;
- if (end > physnodes[i].end) {
- end = physnodes[i].end;
- break;
- }
- }
-
- /*
- * If there won't be at least FAKE_NODE_MIN_SIZE of
- * non-reserved memory in ZONE_DMA32 for the next node,
- * this one must extend to the boundary.
- */
- if (end < dma32_end && dma32_end - end -
- memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
- end = dma32_end;
-
- /*
- * If there won't be enough non-reserved memory for the
- * next node, this one must extend to the end of the
- * physical node.
- */
- if (physnodes[i].end - end -
- memblock_x86_hole_size(end, physnodes[i].end) < size)
- end = physnodes[i].end;
-
- /*
- * Avoid allocating more nodes than requested, which can
- * happen as a result of rounding down each node's size
- * to FAKE_NODE_MIN_SIZE.
- */
- if (nodes_weight(physnode_mask) + ret >= nr_nodes)
- end = physnodes[i].end;
-
- if (setup_node_range(ret++, &physnodes[i].start,
- end - physnodes[i].start,
- physnodes[i].end) < 0)
- node_clear(i, physnode_mask);
- }
- }
- return ret;
-}
-
-/*
- * Returns the end address of a node so that there is at least `size' amount of
- * non-reserved memory or `max_addr' is reached.
- */
-static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
-{
- u64 end = start + size;
-
- while (end - start - memblock_x86_hole_size(start, end) < size) {
- end += FAKE_NODE_MIN_SIZE;
- if (end > max_addr) {
- end = max_addr;
- break;
- }
- }
- return end;
-}
-
-/*
- * Sets up fake nodes of `size' interleaved over physical nodes ranging from
- * `addr' to `max_addr'. The return value is the number of nodes allocated.
- */
-static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
-{
- nodemask_t physnode_mask = NODE_MASK_NONE;
- u64 min_size;
- int ret = 0;
- int i;
-
- if (!size)
- return -1;
- /*
- * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
- * increased accordingly if the requested size is too small. This
- * creates a uniform distribution of node sizes across the entire
- * machine (but not necessarily over physical nodes).
- */
- min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
- MAX_NUMNODES;
- min_size = max(min_size, FAKE_NODE_MIN_SIZE);
- if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
- min_size = (min_size + FAKE_NODE_MIN_SIZE) &
- FAKE_NODE_MIN_HASH_MASK;
- if (size < min_size) {
- pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
- size >> 20, min_size >> 20);
- size = min_size;
- }
- size &= FAKE_NODE_MIN_HASH_MASK;
-
- for (i = 0; i < MAX_NUMNODES; i++)
- if (physnodes[i].start != physnodes[i].end)
- node_set(i, physnode_mask);
- /*
- * Fill physical nodes with fake nodes of size until there is no memory
- * left on any of them.
- */
- while (nodes_weight(physnode_mask)) {
- for_each_node_mask(i, physnode_mask) {
- u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
- u64 end;
-
- end = find_end_of_node(physnodes[i].start,
- physnodes[i].end, size);
- /*
- * If there won't be at least FAKE_NODE_MIN_SIZE of
- * non-reserved memory in ZONE_DMA32 for the next node,
- * this one must extend to the boundary.
- */
- if (end < dma32_end && dma32_end - end -
- memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
- end = dma32_end;
-
- /*
- * If there won't be enough non-reserved memory for the
- * next node, this one must extend to the end of the
- * physical node.
- */
- if (physnodes[i].end - end -
- memblock_x86_hole_size(end, physnodes[i].end) < size)
- end = physnodes[i].end;
-
- /*
- * Setup the fake node that will be allocated as bootmem
- * later. If setup_node_range() returns non-zero, there
- * is no more memory available on this physical node.
- */
- if (setup_node_range(ret++, &physnodes[i].start,
- end - physnodes[i].start,
- physnodes[i].end) < 0)
- node_clear(i, physnode_mask);
- }
- }
- return ret;
-}
-
-/*
- * Sets up the system RAM area from start_pfn to last_pfn according to the
- * numa=fake command-line option.
- */
-static int __init numa_emulation(unsigned long start_pfn,
- unsigned long last_pfn, int acpi, int k8)
-{
- u64 addr = start_pfn << PAGE_SHIFT;
- u64 max_addr = last_pfn << PAGE_SHIFT;
- int num_phys_nodes;
- int num_nodes;
- int i;
-
- num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
- /*
- * If the numa=fake command-line contains a 'M' or 'G', it represents
- * the fixed node size. Otherwise, if it is just a single number N,
- * split the system RAM into N fake nodes.
- */
- if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) {
- u64 size;
-
- size = memparse(cmdline, &cmdline);
- num_nodes = split_nodes_size_interleave(addr, max_addr, size);
- } else {
- unsigned long n;
-
- n = simple_strtoul(cmdline, NULL, 0);
- num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n);
- }
-
- if (num_nodes < 0)
- return num_nodes;
- memnode_shift = compute_hash_shift(nodes, num_nodes, NULL);
- if (memnode_shift < 0) {
- memnode_shift = 0;
- printk(KERN_ERR "No NUMA hash function found. NUMA emulation "
- "disabled.\n");
- return -1;
- }
-
- /*
- * We need to vacate all active ranges that may have been registered for
- * the e820 memory map.
- */
- remove_all_active_ranges();
- for_each_node_mask(i, node_possible_map) {
- memblock_x86_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
- nodes[i].end >> PAGE_SHIFT);
- setup_node_bootmem(i, nodes[i].start, nodes[i].end);
- }
- acpi_fake_nodes(nodes, num_nodes);
- numa_init_array();
- return 0;
-}
-#endif /* CONFIG_NUMA_EMU */
-
-void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
- int acpi, int k8)
-{
- int i;
-
- nodes_clear(node_possible_map);
- nodes_clear(node_online_map);
-
-#ifdef CONFIG_NUMA_EMU
- if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8))
- return;
- nodes_clear(node_possible_map);
- nodes_clear(node_online_map);
-#endif
-
-#ifdef CONFIG_ACPI_NUMA
- if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
- last_pfn << PAGE_SHIFT))
- return;
- nodes_clear(node_possible_map);
- nodes_clear(node_online_map);
-#endif
-
-#ifdef CONFIG_K8_NUMA
- if (!numa_off && k8 && !k8_scan_nodes())
- return;
- nodes_clear(node_possible_map);
- nodes_clear(node_online_map);
-#endif
- printk(KERN_INFO "%s\n",
- numa_off ? "NUMA turned off" : "No NUMA configuration found");
-
- printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
- start_pfn << PAGE_SHIFT,
- last_pfn << PAGE_SHIFT);
- /* setup dummy node covering all memory */
- memnode_shift = 63;
- memnodemap = memnode.embedded_map;
- memnodemap[0] = 0;
- node_set_online(0);
- node_set(0, node_possible_map);
- for (i = 0; i < nr_cpu_ids; i++)
- numa_set_node(i, 0);
- memblock_x86_register_active_regions(0, start_pfn, last_pfn);
- setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
-}
-
-unsigned long __init numa_free_all_bootmem(void)
-{
- unsigned long pages = 0;
- int i;
-
- for_each_online_node(i)
- pages += free_all_bootmem_node(NODE_DATA(i));
-
- pages += free_all_memory_core_early(MAX_NUMNODES);
-
- return pages;
-}
-
-static __init int numa_setup(char *opt)
-{
- if (!opt)
- return -EINVAL;
- if (!strncmp(opt, "off", 3))
- numa_off = 1;
-#ifdef CONFIG_NUMA_EMU
- if (!strncmp(opt, "fake=", 5))
- cmdline = opt + 5;
-#endif
-#ifdef CONFIG_ACPI_NUMA
- if (!strncmp(opt, "noacpi", 6))
- acpi_numa = -1;
-#endif
- return 0;
-}
-early_param("numa", numa_setup);
-
-#ifdef CONFIG_NUMA
-
-static __init int find_near_online_node(int node)
-{
- int n, val;
- int min_val = INT_MAX;
- int best_node = -1;
-
- for_each_online_node(n) {
- val = node_distance(node, n);
-
- if (val < min_val) {
- min_val = val;
- best_node = n;
- }
- }
-
- return best_node;
-}
-
-/*
- * Setup early cpu_to_node.
- *
- * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
- * and apicid_to_node[] tables have valid entries for a CPU.
- * This means we skip cpu_to_node[] initialisation for NUMA
- * emulation and faking node case (when running a kernel compiled
- * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
- * is already initialized in a round robin manner at numa_init_array,
- * prior to this call, and this initialization is good enough
- * for the fake NUMA cases.
- *
- * Called before the per_cpu areas are setup.
- */
-void __init init_cpu_to_node(void)
-{
- int cpu;
- u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
-
- BUG_ON(cpu_to_apicid == NULL);
-
- for_each_possible_cpu(cpu) {
- int node;
- u16 apicid = cpu_to_apicid[cpu];
-
- if (apicid == BAD_APICID)
- continue;
- node = apicid_to_node[apicid];
- if (node == NUMA_NO_NODE)
- continue;
- if (!node_online(node))
- node = find_near_online_node(node);
- numa_set_node(cpu, node);
- }
-}
-#endif
-
-
-void __cpuinit numa_set_node(int cpu, int node)
-{
- int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
-
- /* early setting, no percpu area yet */
- if (cpu_to_node_map) {
- cpu_to_node_map[cpu] = node;
- return;
- }
-
-#ifdef CONFIG_DEBUG_PER_CPU_MAPS
- if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
- printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
- dump_stack();
- return;
- }
-#endif
- per_cpu(x86_cpu_to_node_map, cpu) = node;
-
- if (node != NUMA_NO_NODE)
- set_cpu_numa_node(cpu, node);
-}
-
-void __cpuinit numa_clear_node(int cpu)
-{
- numa_set_node(cpu, NUMA_NO_NODE);
-}
-
-#ifndef CONFIG_DEBUG_PER_CPU_MAPS
-
-void __cpuinit numa_add_cpu(int cpu)
-{
- cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
-}
-
-void __cpuinit numa_remove_cpu(int cpu)
-{
- cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
-}
-
-#else /* CONFIG_DEBUG_PER_CPU_MAPS */
-
-/*
- * --------- debug versions of the numa functions ---------
- */
-static void __cpuinit numa_set_cpumask(int cpu, int enable)
-{
- int node = early_cpu_to_node(cpu);
- struct cpumask *mask;
- char buf[64];
-
- mask = node_to_cpumask_map[node];
- if (mask == NULL) {
- printk(KERN_ERR "node_to_cpumask_map[%i] NULL\n", node);
- dump_stack();
- return;
- }
-
- if (enable)
- cpumask_set_cpu(cpu, mask);
- else
- cpumask_clear_cpu(cpu, mask);
-
- cpulist_scnprintf(buf, sizeof(buf), mask);
- printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
- enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
-}
-
-void __cpuinit numa_add_cpu(int cpu)
-{
- numa_set_cpumask(cpu, 1);
-}
-
-void __cpuinit numa_remove_cpu(int cpu)
-{
- numa_set_cpumask(cpu, 0);
-}
-
-int __cpu_to_node(int cpu)
-{
- if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
- printk(KERN_WARNING
- "cpu_to_node(%d): usage too early!\n", cpu);
- dump_stack();
- return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
- }
- return per_cpu(x86_cpu_to_node_map, cpu);
-}
-EXPORT_SYMBOL(__cpu_to_node);
-
-/*
- * Same function as cpu_to_node() but used if called before the
- * per_cpu areas are setup.
- */
-int early_cpu_to_node(int cpu)
-{
- if (early_per_cpu_ptr(x86_cpu_to_node_map))
- return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
-
- if (!cpu_possible(cpu)) {
- printk(KERN_WARNING
- "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
- dump_stack();
- return NUMA_NO_NODE;
- }
- return per_cpu(x86_cpu_to_node_map, cpu);
-}
-
-/*
- * --------- end of debug versions of the numa functions ---------
- */
-
-#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
new file mode 100644
index 00000000000..a8f90ce3ded
--- /dev/null
+++ b/arch/x86/mm/numa_emulation.c
@@ -0,0 +1,502 @@
+/*
+ * NUMA emulation
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/topology.h>
+#include <linux/memblock.h>
+#include <linux/bootmem.h>
+#include <asm/dma.h>
+
+#include "numa_internal.h"
+
+static int emu_nid_to_phys[MAX_NUMNODES];
+static char *emu_cmdline __initdata;
+
+void __init numa_emu_cmdline(char *str)
+{
+ emu_cmdline = str;
+}
+
+static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
+{
+ int i;
+
+ for (i = 0; i < mi->nr_blks; i++)
+ if (mi->blk[i].nid == nid)
+ return i;
+ return -ENOENT;
+}
+
+static u64 __init mem_hole_size(u64 start, u64 end)
+{
+ unsigned long start_pfn = PFN_UP(start);
+ unsigned long end_pfn = PFN_DOWN(end);
+
+ if (start_pfn < end_pfn)
+ return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn));
+ return 0;
+}
+
+/*
+ * Sets up nid to range from @start to @end. The return value is -errno if
+ * something went wrong, 0 otherwise.
+ */
+static int __init emu_setup_memblk(struct numa_meminfo *ei,
+ struct numa_meminfo *pi,
+ int nid, int phys_blk, u64 size)
+{
+ struct numa_memblk *eb = &ei->blk[ei->nr_blks];
+ struct numa_memblk *pb = &pi->blk[phys_blk];
+
+ if (ei->nr_blks >= NR_NODE_MEMBLKS) {
+ pr_err("NUMA: Too many emulated memblks, failing emulation\n");
+ return -EINVAL;
+ }
+
+ ei->nr_blks++;
+ eb->start = pb->start;
+ eb->end = pb->start + size;
+ eb->nid = nid;
+
+ if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
+ emu_nid_to_phys[nid] = nid;
+
+ pb->start += size;
+ if (pb->start >= pb->end) {
+ WARN_ON_ONCE(pb->start > pb->end);
+ numa_remove_memblk_from(phys_blk, pi);
+ }
+
+ printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n",
+ nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20);
+ return 0;
+}
+
+/*
+ * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
+ * to max_addr. The return value is the number of nodes allocated.
+ */
+static int __init split_nodes_interleave(struct numa_meminfo *ei,
+ struct numa_meminfo *pi,
+ u64 addr, u64 max_addr, int nr_nodes)
+{
+ nodemask_t physnode_mask = NODE_MASK_NONE;
+ u64 size;
+ int big;
+ int nid = 0;
+ int i, ret;
+
+ if (nr_nodes <= 0)
+ return -1;
+ if (nr_nodes > MAX_NUMNODES) {
+ pr_info("numa=fake=%d too large, reducing to %d\n",
+ nr_nodes, MAX_NUMNODES);
+ nr_nodes = MAX_NUMNODES;
+ }
+
+ /*
+ * Calculate target node size. x86_32 freaks on __udivdi3() so do
+ * the division in ulong number of pages and convert back.
+ */
+ size = max_addr - addr - mem_hole_size(addr, max_addr);
+ size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);
+
+ /*
+ * Calculate the number of big nodes that can be allocated as a result
+ * of consolidating the remainder.
+ */
+ big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
+ FAKE_NODE_MIN_SIZE;
+
+ size &= FAKE_NODE_MIN_HASH_MASK;
+ if (!size) {
+ pr_err("Not enough memory for each node. "
+ "NUMA emulation disabled.\n");
+ return -1;
+ }
+
+ for (i = 0; i < pi->nr_blks; i++)
+ node_set(pi->blk[i].nid, physnode_mask);
+
+ /*
+ * Continue to fill physical nodes with fake nodes until there is no
+ * memory left on any of them.
+ */
+ while (nodes_weight(physnode_mask)) {
+ for_each_node_mask(i, physnode_mask) {
+ u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
+ u64 start, limit, end;
+ int phys_blk;
+
+ phys_blk = emu_find_memblk_by_nid(i, pi);
+ if (phys_blk < 0) {
+ node_clear(i, physnode_mask);
+ continue;
+ }
+ start = pi->blk[phys_blk].start;
+ limit = pi->blk[phys_blk].end;
+ end = start + size;
+
+ if (nid < big)
+ end += FAKE_NODE_MIN_SIZE;
+
+ /*
+ * Continue to add memory to this fake node if its
+ * non-reserved memory is less than the per-node size.
+ */
+ while (end - start - mem_hole_size(start, end) < size) {
+ end += FAKE_NODE_MIN_SIZE;
+ if (end > limit) {
+ end = limit;
+ break;
+ }
+ }
+
+ /*
+ * If there won't be at least FAKE_NODE_MIN_SIZE of
+ * non-reserved memory in ZONE_DMA32 for the next node,
+ * this one must extend to the boundary.
+ */
+ if (end < dma32_end && dma32_end - end -
+ mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+ end = dma32_end;
+
+ /*
+ * If there won't be enough non-reserved memory for the
+ * next node, this one must extend to the end of the
+ * physical node.
+ */
+ if (limit - end - mem_hole_size(end, limit) < size)
+ end = limit;
+
+ ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
+ phys_blk,
+ min(end, limit) - start);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Returns the end address of a node so that there is at least `size' amount of
+ * non-reserved memory or `max_addr' is reached.
+ */
+static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
+{
+ u64 end = start + size;
+
+ while (end - start - mem_hole_size(start, end) < size) {
+ end += FAKE_NODE_MIN_SIZE;
+ if (end > max_addr) {
+ end = max_addr;
+ break;
+ }
+ }
+ return end;
+}
+
+/*
+ * Sets up fake nodes of `size' interleaved over physical nodes ranging from
+ * `addr' to `max_addr'. The return value is the number of nodes allocated.
+ */
+static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
+ struct numa_meminfo *pi,
+ u64 addr, u64 max_addr, u64 size)
+{
+ nodemask_t physnode_mask = NODE_MASK_NONE;
+ u64 min_size;
+ int nid = 0;
+ int i, ret;
+
+ if (!size)
+ return -1;
+ /*
+ * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
+ * increased accordingly if the requested size is too small. This
+ * creates a uniform distribution of node sizes across the entire
+ * machine (but not necessarily over physical nodes).
+ */
+ min_size = (max_addr - addr - mem_hole_size(addr, max_addr)) / MAX_NUMNODES;
+ min_size = max(min_size, FAKE_NODE_MIN_SIZE);
+ if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
+ min_size = (min_size + FAKE_NODE_MIN_SIZE) &
+ FAKE_NODE_MIN_HASH_MASK;
+ if (size < min_size) {
+ pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
+ size >> 20, min_size >> 20);
+ size = min_size;
+ }
+ size &= FAKE_NODE_MIN_HASH_MASK;
+
+ for (i = 0; i < pi->nr_blks; i++)
+ node_set(pi->blk[i].nid, physnode_mask);
+
+ /*
+ * Fill physical nodes with fake nodes of size until there is no memory
+ * left on any of them.
+ */
+ while (nodes_weight(physnode_mask)) {
+ for_each_node_mask(i, physnode_mask) {
+ u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
+ u64 start, limit, end;
+ int phys_blk;
+
+ phys_blk = emu_find_memblk_by_nid(i, pi);
+ if (phys_blk < 0) {
+ node_clear(i, physnode_mask);
+ continue;
+ }
+ start = pi->blk[phys_blk].start;
+ limit = pi->blk[phys_blk].end;
+
+ end = find_end_of_node(start, limit, size);
+ /*
+ * If there won't be at least FAKE_NODE_MIN_SIZE of
+ * non-reserved memory in ZONE_DMA32 for the next node,
+ * this one must extend to the boundary.
+ */
+ if (end < dma32_end && dma32_end - end -
+ mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+ end = dma32_end;
+
+ /*
+ * If there won't be enough non-reserved memory for the
+ * next node, this one must extend to the end of the
+ * physical node.
+ */
+ if (limit - end - mem_hole_size(end, limit) < size)
+ end = limit;
+
+ ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
+ phys_blk,
+ min(end, limit) - start);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ return 0;
+}
+
+/**
+ * numa_emulation - Emulate NUMA nodes
+ * @numa_meminfo: NUMA configuration to massage
+ * @numa_dist_cnt: The size of the physical NUMA distance table
+ *
+ * Emulate NUMA nodes according to the numa=fake kernel parameter.
+ * @numa_meminfo contains the physical memory configuration and is modified
+ * to reflect the emulated configuration on success. @numa_dist_cnt is
+ * used to determine the size of the physical distance table.
+ *
+ * On success, the following modifications are made.
+ *
+ * - @numa_meminfo is updated to reflect the emulated nodes.
+ *
+ * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
+ * emulated nodes.
+ *
+ * - NUMA distance table is rebuilt to represent distances between emulated
+ * nodes. The distances are determined considering how emulated nodes
+ * are mapped to physical nodes and match the actual distances.
+ *
+ * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
+ * nodes. This is used by numa_add_cpu() and numa_remove_cpu().
+ *
+ * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
+ * identity mapping and no other modification is made.
+ */
+void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
+{
+ static struct numa_meminfo ei __initdata;
+ static struct numa_meminfo pi __initdata;
+ const u64 max_addr = PFN_PHYS(max_pfn);
+ u8 *phys_dist = NULL;
+ size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
+ int max_emu_nid, dfl_phys_nid;
+ int i, j, ret;
+
+ if (!emu_cmdline)
+ goto no_emu;
+
+ memset(&ei, 0, sizeof(ei));
+ pi = *numa_meminfo;
+
+ for (i = 0; i < MAX_NUMNODES; i++)
+ emu_nid_to_phys[i] = NUMA_NO_NODE;
+
+ /*
+ * If the numa=fake command-line contains a 'M' or 'G', it represents
+ * the fixed node size. Otherwise, if it is just a single number N,
+ * split the system RAM into N fake nodes.
+ */
+ if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
+ u64 size;
+
+ size = memparse(emu_cmdline, &emu_cmdline);
+ ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
+ } else {
+ unsigned long n;
+
+ n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
+ ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
+ }
+ if (*emu_cmdline == ':')
+ emu_cmdline++;
+
+ if (ret < 0)
+ goto no_emu;
+
+ if (numa_cleanup_meminfo(&ei) < 0) {
+ pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
+ goto no_emu;
+ }
+
+ /* copy the physical distance table */
+ if (numa_dist_cnt) {
+ u64 phys;
+
+ phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
+ phys_size, PAGE_SIZE);
+ if (!phys) {
+ pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
+ goto no_emu;
+ }
+ memblock_reserve(phys, phys_size);
+ phys_dist = __va(phys);
+
+ for (i = 0; i < numa_dist_cnt; i++)
+ for (j = 0; j < numa_dist_cnt; j++)
+ phys_dist[i * numa_dist_cnt + j] =
+ node_distance(i, j);
+ }
+
+ /*
+ * Determine the max emulated nid and the default phys nid to use
+ * for unmapped nodes.
+ */
+ max_emu_nid = 0;
+ dfl_phys_nid = NUMA_NO_NODE;
+ for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
+ if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
+ max_emu_nid = i;
+ if (dfl_phys_nid == NUMA_NO_NODE)
+ dfl_phys_nid = emu_nid_to_phys[i];
+ }
+ }
+ if (dfl_phys_nid == NUMA_NO_NODE) {
+ pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n");
+ goto no_emu;
+ }
+
+ /* commit */
+ *numa_meminfo = ei;
+
+ /*
+ * Transform __apicid_to_node table to use emulated nids by
+ * reverse-mapping phys_nid. The maps should always exist but fall
+ * back to zero just in case.
+ */
+ for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
+ if (__apicid_to_node[i] == NUMA_NO_NODE)
+ continue;
+ for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
+ if (__apicid_to_node[i] == emu_nid_to_phys[j])
+ break;
+ __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
+ }
+
+ /* make sure all emulated nodes are mapped to a physical node */
+ for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
+ if (emu_nid_to_phys[i] == NUMA_NO_NODE)
+ emu_nid_to_phys[i] = dfl_phys_nid;
+
+ /* transform distance table */
+ numa_reset_distance();
+ for (i = 0; i < max_emu_nid + 1; i++) {
+ for (j = 0; j < max_emu_nid + 1; j++) {
+ int physi = emu_nid_to_phys[i];
+ int physj = emu_nid_to_phys[j];
+ int dist;
+
+ if (get_option(&emu_cmdline, &dist) == 2)
+ ;
+ else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
+ dist = physi == physj ?
+ LOCAL_DISTANCE : REMOTE_DISTANCE;
+ else
+ dist = phys_dist[physi * numa_dist_cnt + physj];
+
+ numa_set_distance(i, j, dist);
+ }
+ }
+
+ /* free the copied physical distance table */
+ if (phys_dist)
+ memblock_free(__pa(phys_dist), phys_size);
+ return;
+
+no_emu:
+ /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */
+ for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
+ emu_nid_to_phys[i] = i;
+}
+
+#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+void numa_add_cpu(int cpu)
+{
+ int physnid, nid;
+
+ nid = early_cpu_to_node(cpu);
+ BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
+
+ physnid = emu_nid_to_phys[nid];
+
+ /*
+ * Map the cpu to each emulated node that is allocated on the physical
+ * node of the cpu's apic id.
+ */
+ for_each_online_node(nid)
+ if (emu_nid_to_phys[nid] == physnid)
+ cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
+}
+
+void numa_remove_cpu(int cpu)
+{
+ int i;
+
+ for_each_online_node(i)
+ cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
+}
+#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
+static void numa_set_cpumask(int cpu, bool enable)
+{
+ int nid, physnid;
+
+ nid = early_cpu_to_node(cpu);
+ if (nid == NUMA_NO_NODE) {
+ /* early_cpu_to_node() already emits a warning and trace */
+ return;
+ }
+
+ physnid = emu_nid_to_phys[nid];
+
+ for_each_online_node(nid) {
+ if (emu_nid_to_phys[nid] != physnid)
+ continue;
+
+ debug_cpumask_set_cpu(cpu, nid, enable);
+ }
+}
+
+void numa_add_cpu(int cpu)
+{
+ numa_set_cpumask(cpu, true);
+}
+
+void numa_remove_cpu(int cpu)
+{
+ numa_set_cpumask(cpu, false);
+}
+#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
new file mode 100644
index 00000000000..ad86ec91e64
--- /dev/null
+++ b/arch/x86/mm/numa_internal.h
@@ -0,0 +1,33 @@
+#ifndef __X86_MM_NUMA_INTERNAL_H
+#define __X86_MM_NUMA_INTERNAL_H
+
+#include <linux/types.h>
+#include <asm/numa.h>
+
+struct numa_memblk {
+ u64 start;
+ u64 end;
+ int nid;
+};
+
+struct numa_meminfo {
+ int nr_blks;
+ struct numa_memblk blk[NR_NODE_MEMBLKS];
+};
+
+void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi);
+int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
+void __init numa_reset_distance(void);
+
+void __init x86_numa_init(void);
+
+#ifdef CONFIG_NUMA_EMU
+void __init numa_emulation(struct numa_meminfo *numa_meminfo,
+ int numa_dist_cnt);
+#else
+static inline void numa_emulation(struct numa_meminfo *numa_meminfo,
+ int numa_dist_cnt)
+{ }
+#endif
+
+#endif /* __X86_MM_NUMA_INTERNAL_H */
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index e1d10690921..6629f397b46 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -8,7 +8,6 @@
#include <linux/kthread.h>
#include <linux/random.h>
#include <linux/kernel.h>
-#include <linux/init.h>
#include <linux/mm.h>
#include <asm/cacheflush.h>
@@ -36,7 +35,7 @@ enum {
static int pte_testbit(pte_t pte)
{
- return pte_flags(pte) & _PAGE_UNUSED1;
+ return pte_flags(pte) & _PAGE_SOFTW1;
}
struct split_state {
@@ -68,7 +67,7 @@ static int print_split(struct split_state *s)
s->gpg++;
i += GPS/PAGE_SIZE;
} else if (level == PG_LEVEL_2M) {
- if (!(pte_val(*pte) & _PAGE_PSE)) {
+ if ((pte_val(*pte) & _PAGE_PRESENT) && !(pte_val(*pte) & _PAGE_PSE)) {
printk(KERN_ERR
"%lx level %d but not PSE %Lx\n",
addr, level, (u64)pte_val(*pte));
@@ -123,21 +122,19 @@ static int pageattr_test(void)
if (print)
printk(KERN_INFO "CPA self-test:\n");
- bm = vmalloc((max_pfn_mapped + 7) / 8);
+ bm = vzalloc((max_pfn_mapped + 7) / 8);
if (!bm) {
printk(KERN_ERR "CPA Cannot vmalloc bitmap\n");
return -ENOMEM;
}
- memset(bm, 0, (max_pfn_mapped + 7) / 8);
failed += print_split(&sa);
- srandom32(100);
for (i = 0; i < NTEST; i++) {
- unsigned long pfn = random32() % max_pfn_mapped;
+ unsigned long pfn = prandom_u32() % max_pfn_mapped;
addr[i] = (unsigned long)__va(pfn << PAGE_SHIFT);
- len[i] = random32() % 100;
+ len[i] = prandom_u32() % 100;
len[i] = min_t(unsigned long, len[i], max_pfn_mapped - pfn - 1);
if (len[i] == 0)
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 532e7933d60..ae242a7c11c 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -13,6 +13,7 @@
#include <linux/pfn.h>
#include <linux/percpu.h>
#include <linux/gfp.h>
+#include <linux/pci.h>
#include <asm/e820.h>
#include <asm/processor.h>
@@ -29,6 +30,7 @@
*/
struct cpa_data {
unsigned long *vaddr;
+ pgd_t *pgd;
pgprot_t mask_set;
pgprot_t mask_clr;
int numpages;
@@ -56,12 +58,10 @@ static unsigned long direct_pages_count[PG_LEVEL_NUM];
void update_page_count(int level, unsigned long pages)
{
- unsigned long flags;
-
/* Protect against CPA */
- spin_lock_irqsave(&pgd_lock, flags);
+ spin_lock(&pgd_lock);
direct_pages_count[level] += pages;
- spin_unlock_irqrestore(&pgd_lock, flags);
+ spin_unlock(&pgd_lock);
}
static void split_page_count(int level)
@@ -95,12 +95,12 @@ static inline void split_page_count(int level) { }
static inline unsigned long highmap_start_pfn(void)
{
- return __pa(_text) >> PAGE_SHIFT;
+ return __pa_symbol(_text) >> PAGE_SHIFT;
}
static inline unsigned long highmap_end_pfn(void)
{
- return __pa(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
+ return __pa_symbol(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
}
#endif
@@ -123,11 +123,11 @@ within(unsigned long addr, unsigned long start, unsigned long end)
/**
* clflush_cache_range - flush a cache range with clflush
- * @addr: virtual start address
+ * @vaddr: virtual start address
* @size: number of bytes to flush
*
- * clflush is an unordered instruction which needs fencing with mfence
- * to avoid ordering issues.
+ * clflushopt is an unordered instruction which needs fencing with mfence or
+ * sfence to avoid ordering issues.
*/
void clflush_cache_range(void *vaddr, unsigned int size)
{
@@ -136,11 +136,11 @@ void clflush_cache_range(void *vaddr, unsigned int size)
mb();
for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
- clflush(vaddr);
+ clflushopt(vaddr);
/*
* Flush any possible final partial cacheline:
*/
- clflush(vend);
+ clflushopt(vend);
mb();
}
@@ -260,8 +260,10 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
* The BIOS area between 640k and 1Mb needs to be executable for
* PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
*/
- if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
+#ifdef CONFIG_PCI_BIOS
+ if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
pgprot_val(forbidden) |= _PAGE_NX;
+#endif
/*
* The kernel text needs to be executable for obvious reasons
@@ -275,8 +277,8 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
* The .rodata section needs to be read-only. Using the pfn
* catches all aliases.
*/
- if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
- __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
+ if (within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT,
+ __pa_symbol(__end_rodata) >> PAGE_SHIFT))
pgprot_val(forbidden) |= _PAGE_RW;
#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
@@ -309,7 +311,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
* these shared mappings are made of small page mappings.
* Thus this don't enforce !RW mapping for small page kernel
* text mapping logic will help Linux Xen parvirt guest boot
- * aswell.
+ * as well.
*/
if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
pgprot_val(forbidden) |= _PAGE_RW;
@@ -322,16 +324,12 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
}
/*
- * Lookup the page table entry for a virtual address. Return a pointer
- * to the entry and the level of the mapping.
- *
- * Note: We return pud and pmd either when the entry is marked large
- * or when the present bit is not set. Otherwise we would return a
- * pointer to a nonexisting mapping.
+ * Lookup the page table entry for a virtual address in a specific pgd.
+ * Return a pointer to the entry and the level of the mapping.
*/
-pte_t *lookup_address(unsigned long address, unsigned int *level)
+pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
+ unsigned int *level)
{
- pgd_t *pgd = pgd_offset_k(address);
pud_t *pud;
pmd_t *pmd;
@@ -360,8 +358,62 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
return pte_offset_kernel(pmd, address);
}
+
+/*
+ * Lookup the page table entry for a virtual address. Return a pointer
+ * to the entry and the level of the mapping.
+ *
+ * Note: We return pud and pmd either when the entry is marked large
+ * or when the present bit is not set. Otherwise we would return a
+ * pointer to a nonexisting mapping.
+ */
+pte_t *lookup_address(unsigned long address, unsigned int *level)
+{
+ return lookup_address_in_pgd(pgd_offset_k(address), address, level);
+}
EXPORT_SYMBOL_GPL(lookup_address);
+static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
+ unsigned int *level)
+{
+ if (cpa->pgd)
+ return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
+ address, level);
+
+ return lookup_address(address, level);
+}
+
+/*
+ * This is necessary because __pa() does not work on some
+ * kinds of memory, like vmalloc() or the alloc_remap()
+ * areas on 32-bit NUMA systems. The percpu areas can
+ * end up in this kind of memory, for instance.
+ *
+ * This could be optimized, but it is only intended to be
+ * used at inititalization time, and keeping it
+ * unoptimized should increase the testing coverage for
+ * the more obscure platforms.
+ */
+phys_addr_t slow_virt_to_phys(void *__virt_addr)
+{
+ unsigned long virt_addr = (unsigned long)__virt_addr;
+ phys_addr_t phys_addr;
+ unsigned long offset;
+ enum pg_level level;
+ unsigned long psize;
+ unsigned long pmask;
+ pte_t *pte;
+
+ pte = lookup_address(virt_addr, &level);
+ BUG_ON(!pte);
+ psize = page_level_size(level);
+ pmask = page_level_mask(level);
+ offset = virt_addr & ~pmask;
+ phys_addr = pte_pfn(*pte) << PAGE_SHIFT;
+ return (phys_addr | offset);
+}
+EXPORT_SYMBOL_GPL(slow_virt_to_phys);
+
/*
* Set the new pmd in all the pgds we know about:
*/
@@ -391,35 +443,32 @@ static int
try_preserve_large_page(pte_t *kpte, unsigned long address,
struct cpa_data *cpa)
{
- unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
+ unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn;
pte_t new_pte, old_pte, *tmp;
- pgprot_t old_prot, new_prot;
+ pgprot_t old_prot, new_prot, req_prot;
int i, do_split = 1;
- unsigned int level;
+ enum pg_level level;
if (cpa->force_split)
return 1;
- spin_lock_irqsave(&pgd_lock, flags);
+ spin_lock(&pgd_lock);
/*
* Check for races, another CPU might have split this page
* up already:
*/
- tmp = lookup_address(address, &level);
+ tmp = _lookup_address_cpa(cpa, address, &level);
if (tmp != kpte)
goto out_unlock;
switch (level) {
case PG_LEVEL_2M:
- psize = PMD_PAGE_SIZE;
- pmask = PMD_PAGE_MASK;
- break;
#ifdef CONFIG_X86_64
case PG_LEVEL_1G:
- psize = PUD_PAGE_SIZE;
- pmask = PUD_PAGE_MASK;
- break;
#endif
+ psize = page_level_size(level);
+ pmask = page_level_mask(level);
+ break;
default:
do_split = -EINVAL;
goto out_unlock;
@@ -438,10 +487,23 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
* We are safe now. Check whether the new pgprot is the same:
*/
old_pte = *kpte;
- old_prot = new_prot = pte_pgprot(old_pte);
+ old_prot = req_prot = pte_pgprot(old_pte);
- pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
- pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
+ pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
+ pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
+
+ /*
+ * Set the PSE and GLOBAL flags only if the PRESENT flag is
+ * set otherwise pmd_present/pmd_huge will return true even on
+ * a non present pmd. The canon_pgprot will clear _PAGE_GLOBAL
+ * for the ancient hardware that doesn't support it.
+ */
+ if (pgprot_val(req_prot) & _PAGE_PRESENT)
+ pgprot_val(req_prot) |= _PAGE_PSE | _PAGE_GLOBAL;
+ else
+ pgprot_val(req_prot) &= ~(_PAGE_PSE | _PAGE_GLOBAL);
+
+ req_prot = canon_pgprot(req_prot);
/*
* old_pte points to the large page base address. So we need
@@ -450,17 +512,17 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
cpa->pfn = pfn;
- new_prot = static_protections(new_prot, address, pfn);
+ new_prot = static_protections(req_prot, address, pfn);
/*
* We need to check the full range, whether
* static_protection() requires a different pgprot for one of
* the pages in the range we try to preserve:
*/
- addr = address + PAGE_SIZE;
- pfn++;
- for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) {
- pgprot_t chk_prot = static_protections(new_prot, addr, pfn);
+ addr = address & pmask;
+ pfn = pte_pfn(old_pte);
+ for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
+ pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
if (pgprot_val(chk_prot) != pgprot_val(new_prot))
goto out_unlock;
@@ -483,49 +545,44 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
* that we limited the number of possible pages already to
* the number of pages in the large page.
*/
- if (address == (nextpage_addr - psize) && cpa->numpages == numpages) {
+ if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
/*
* The address is aligned and the number of pages
* covers the full page.
*/
- new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
+ new_pte = pfn_pte(pte_pfn(old_pte), new_prot);
__set_pmd_pte(kpte, address, new_pte);
cpa->flags |= CPA_FLUSHTLB;
do_split = 0;
}
out_unlock:
- spin_unlock_irqrestore(&pgd_lock, flags);
+ spin_unlock(&pgd_lock);
return do_split;
}
-static int split_large_page(pte_t *kpte, unsigned long address)
+static int
+__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
+ struct page *base)
{
- unsigned long flags, pfn, pfninc = 1;
+ pte_t *pbase = (pte_t *)page_address(base);
+ unsigned long pfn, pfninc = 1;
unsigned int i, level;
- pte_t *pbase, *tmp;
+ pte_t *tmp;
pgprot_t ref_prot;
- struct page *base;
-
- if (!debug_pagealloc)
- spin_unlock(&cpa_lock);
- base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
- if (!debug_pagealloc)
- spin_lock(&cpa_lock);
- if (!base)
- return -ENOMEM;
- spin_lock_irqsave(&pgd_lock, flags);
+ spin_lock(&pgd_lock);
/*
* Check for races, another CPU might have split this page
* up for us already:
*/
- tmp = lookup_address(address, &level);
- if (tmp != kpte)
- goto out_unlock;
+ tmp = _lookup_address_cpa(cpa, address, &level);
+ if (tmp != kpte) {
+ spin_unlock(&pgd_lock);
+ return 1;
+ }
- pbase = (pte_t *)page_address(base);
paravirt_alloc_pte(&init_mm, page_to_pfn(base));
ref_prot = pte_pgprot(pte_clrhuge(*kpte));
/*
@@ -539,27 +596,40 @@ static int split_large_page(pte_t *kpte, unsigned long address)
#ifdef CONFIG_X86_64
if (level == PG_LEVEL_1G) {
pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
- pgprot_val(ref_prot) |= _PAGE_PSE;
+ /*
+ * Set the PSE flags only if the PRESENT flag is set
+ * otherwise pmd_present/pmd_huge will return true
+ * even on a non present pmd.
+ */
+ if (pgprot_val(ref_prot) & _PAGE_PRESENT)
+ pgprot_val(ref_prot) |= _PAGE_PSE;
+ else
+ pgprot_val(ref_prot) &= ~_PAGE_PSE;
}
#endif
/*
+ * Set the GLOBAL flags only if the PRESENT flag is set
+ * otherwise pmd/pte_present will return true even on a non
+ * present pmd/pte. The canon_pgprot will clear _PAGE_GLOBAL
+ * for the ancient hardware that doesn't support it.
+ */
+ if (pgprot_val(ref_prot) & _PAGE_PRESENT)
+ pgprot_val(ref_prot) |= _PAGE_GLOBAL;
+ else
+ pgprot_val(ref_prot) &= ~_PAGE_GLOBAL;
+
+ /*
* Get the target pfn from the original entry:
*/
pfn = pte_pfn(*kpte);
for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
- set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
+ set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot)));
- if (address >= (unsigned long)__va(0) &&
- address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
+ if (pfn_range_is_mapped(PFN_DOWN(__pa(address)),
+ PFN_DOWN(__pa(address)) + 1))
split_page_count(level);
-#ifdef CONFIG_X86_64
- if (address >= (unsigned long)__va(1UL<<32) &&
- address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
- split_page_count(level);
-#endif
-
/*
* Install the new, split up pagetable.
*
@@ -578,24 +648,420 @@ static int split_large_page(pte_t *kpte, unsigned long address)
* going on.
*/
__flush_tlb_all();
+ spin_unlock(&pgd_lock);
- base = NULL;
+ return 0;
+}
+
+static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
+ unsigned long address)
+{
+ struct page *base;
+
+ if (!debug_pagealloc)
+ spin_unlock(&cpa_lock);
+ base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
+ if (!debug_pagealloc)
+ spin_lock(&cpa_lock);
+ if (!base)
+ return -ENOMEM;
+
+ if (__split_large_page(cpa, kpte, address, base))
+ __free_page(base);
+
+ return 0;
+}
+
+static bool try_to_free_pte_page(pte_t *pte)
+{
+ int i;
+
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ if (!pte_none(pte[i]))
+ return false;
+
+ free_page((unsigned long)pte);
+ return true;
+}
+
+static bool try_to_free_pmd_page(pmd_t *pmd)
+{
+ int i;
+
+ for (i = 0; i < PTRS_PER_PMD; i++)
+ if (!pmd_none(pmd[i]))
+ return false;
+
+ free_page((unsigned long)pmd);
+ return true;
+}
+
+static bool try_to_free_pud_page(pud_t *pud)
+{
+ int i;
+
+ for (i = 0; i < PTRS_PER_PUD; i++)
+ if (!pud_none(pud[i]))
+ return false;
+
+ free_page((unsigned long)pud);
+ return true;
+}
+
+static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
+{
+ pte_t *pte = pte_offset_kernel(pmd, start);
+
+ while (start < end) {
+ set_pte(pte, __pte(0));
+
+ start += PAGE_SIZE;
+ pte++;
+ }
+
+ if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
+ pmd_clear(pmd);
+ return true;
+ }
+ return false;
+}
+
+static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
+ unsigned long start, unsigned long end)
+{
+ if (unmap_pte_range(pmd, start, end))
+ if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+ pud_clear(pud);
+}
+
+static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
+{
+ pmd_t *pmd = pmd_offset(pud, start);
-out_unlock:
/*
- * If we dropped out via the lookup_address check under
- * pgd_lock then stick the page back into the pool:
+ * Not on a 2MB page boundary?
*/
- if (base)
- __free_page(base);
- spin_unlock_irqrestore(&pgd_lock, flags);
+ if (start & (PMD_SIZE - 1)) {
+ unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
+ unsigned long pre_end = min_t(unsigned long, end, next_page);
+
+ __unmap_pmd_range(pud, pmd, start, pre_end);
+
+ start = pre_end;
+ pmd++;
+ }
+
+ /*
+ * Try to unmap in 2M chunks.
+ */
+ while (end - start >= PMD_SIZE) {
+ if (pmd_large(*pmd))
+ pmd_clear(pmd);
+ else
+ __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
+
+ start += PMD_SIZE;
+ pmd++;
+ }
+
+ /*
+ * 4K leftovers?
+ */
+ if (start < end)
+ return __unmap_pmd_range(pud, pmd, start, end);
+
+ /*
+ * Try again to free the PMD page if haven't succeeded above.
+ */
+ if (!pud_none(*pud))
+ if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+ pud_clear(pud);
+}
+
+static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+{
+ pud_t *pud = pud_offset(pgd, start);
+
+ /*
+ * Not on a GB page boundary?
+ */
+ if (start & (PUD_SIZE - 1)) {
+ unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
+ unsigned long pre_end = min_t(unsigned long, end, next_page);
+
+ unmap_pmd_range(pud, start, pre_end);
+
+ start = pre_end;
+ pud++;
+ }
+ /*
+ * Try to unmap in 1G chunks?
+ */
+ while (end - start >= PUD_SIZE) {
+
+ if (pud_large(*pud))
+ pud_clear(pud);
+ else
+ unmap_pmd_range(pud, start, start + PUD_SIZE);
+
+ start += PUD_SIZE;
+ pud++;
+ }
+
+ /*
+ * 2M leftovers?
+ */
+ if (start < end)
+ unmap_pmd_range(pud, start, end);
+
+ /*
+ * No need to try to free the PUD page because we'll free it in
+ * populate_pgd's error path
+ */
+}
+
+static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end)
+{
+ pgd_t *pgd_entry = root + pgd_index(addr);
+
+ unmap_pud_range(pgd_entry, addr, end);
+
+ if (try_to_free_pud_page((pud_t *)pgd_page_vaddr(*pgd_entry)))
+ pgd_clear(pgd_entry);
+}
+
+static int alloc_pte_page(pmd_t *pmd)
+{
+ pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+ if (!pte)
+ return -1;
+
+ set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
+ return 0;
+}
+
+static int alloc_pmd_page(pud_t *pud)
+{
+ pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+ if (!pmd)
+ return -1;
+
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+ return 0;
+}
+
+static void populate_pte(struct cpa_data *cpa,
+ unsigned long start, unsigned long end,
+ unsigned num_pages, pmd_t *pmd, pgprot_t pgprot)
+{
+ pte_t *pte;
+
+ pte = pte_offset_kernel(pmd, start);
+
+ while (num_pages-- && start < end) {
+
+ /* deal with the NX bit */
+ if (!(pgprot_val(pgprot) & _PAGE_NX))
+ cpa->pfn &= ~_PAGE_NX;
+
+ set_pte(pte, pfn_pte(cpa->pfn >> PAGE_SHIFT, pgprot));
+
+ start += PAGE_SIZE;
+ cpa->pfn += PAGE_SIZE;
+ pte++;
+ }
+}
+
+static int populate_pmd(struct cpa_data *cpa,
+ unsigned long start, unsigned long end,
+ unsigned num_pages, pud_t *pud, pgprot_t pgprot)
+{
+ unsigned int cur_pages = 0;
+ pmd_t *pmd;
+
+ /*
+ * Not on a 2M boundary?
+ */
+ if (start & (PMD_SIZE - 1)) {
+ unsigned long pre_end = start + (num_pages << PAGE_SHIFT);
+ unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
+
+ pre_end = min_t(unsigned long, pre_end, next_page);
+ cur_pages = (pre_end - start) >> PAGE_SHIFT;
+ cur_pages = min_t(unsigned int, num_pages, cur_pages);
+
+ /*
+ * Need a PTE page?
+ */
+ pmd = pmd_offset(pud, start);
+ if (pmd_none(*pmd))
+ if (alloc_pte_page(pmd))
+ return -1;
+
+ populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot);
+
+ start = pre_end;
+ }
+
+ /*
+ * We mapped them all?
+ */
+ if (num_pages == cur_pages)
+ return cur_pages;
+
+ while (end - start >= PMD_SIZE) {
+
+ /*
+ * We cannot use a 1G page so allocate a PMD page if needed.
+ */
+ if (pud_none(*pud))
+ if (alloc_pmd_page(pud))
+ return -1;
+
+ pmd = pmd_offset(pud, start);
+
+ set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot)));
+
+ start += PMD_SIZE;
+ cpa->pfn += PMD_SIZE;
+ cur_pages += PMD_SIZE >> PAGE_SHIFT;
+ }
+
+ /*
+ * Map trailing 4K pages.
+ */
+ if (start < end) {
+ pmd = pmd_offset(pud, start);
+ if (pmd_none(*pmd))
+ if (alloc_pte_page(pmd))
+ return -1;
+
+ populate_pte(cpa, start, end, num_pages - cur_pages,
+ pmd, pgprot);
+ }
+ return num_pages;
+}
+
+static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
+ pgprot_t pgprot)
+{
+ pud_t *pud;
+ unsigned long end;
+ int cur_pages = 0;
+
+ end = start + (cpa->numpages << PAGE_SHIFT);
+
+ /*
+ * Not on a Gb page boundary? => map everything up to it with
+ * smaller pages.
+ */
+ if (start & (PUD_SIZE - 1)) {
+ unsigned long pre_end;
+ unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
+
+ pre_end = min_t(unsigned long, end, next_page);
+ cur_pages = (pre_end - start) >> PAGE_SHIFT;
+ cur_pages = min_t(int, (int)cpa->numpages, cur_pages);
+
+ pud = pud_offset(pgd, start);
+
+ /*
+ * Need a PMD page?
+ */
+ if (pud_none(*pud))
+ if (alloc_pmd_page(pud))
+ return -1;
+
+ cur_pages = populate_pmd(cpa, start, pre_end, cur_pages,
+ pud, pgprot);
+ if (cur_pages < 0)
+ return cur_pages;
+
+ start = pre_end;
+ }
+
+ /* We mapped them all? */
+ if (cpa->numpages == cur_pages)
+ return cur_pages;
+
+ pud = pud_offset(pgd, start);
+
+ /*
+ * Map everything starting from the Gb boundary, possibly with 1G pages
+ */
+ while (end - start >= PUD_SIZE) {
+ set_pud(pud, __pud(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot)));
+
+ start += PUD_SIZE;
+ cpa->pfn += PUD_SIZE;
+ cur_pages += PUD_SIZE >> PAGE_SHIFT;
+ pud++;
+ }
+
+ /* Map trailing leftover */
+ if (start < end) {
+ int tmp;
+
+ pud = pud_offset(pgd, start);
+ if (pud_none(*pud))
+ if (alloc_pmd_page(pud))
+ return -1;
+
+ tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages,
+ pud, pgprot);
+ if (tmp < 0)
+ return cur_pages;
+
+ cur_pages += tmp;
+ }
+ return cur_pages;
+}
+
+/*
+ * Restrictions for kernel page table do not necessarily apply when mapping in
+ * an alternate PGD.
+ */
+static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
+{
+ pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
+ pud_t *pud = NULL; /* shut up gcc */
+ pgd_t *pgd_entry;
+ int ret;
+
+ pgd_entry = cpa->pgd + pgd_index(addr);
+
+ /*
+ * Allocate a PUD page and hand it down for mapping.
+ */
+ if (pgd_none(*pgd_entry)) {
+ pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+ if (!pud)
+ return -1;
+
+ set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE));
+ }
+
+ pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
+ pgprot_val(pgprot) |= pgprot_val(cpa->mask_set);
+
+ ret = populate_pud(cpa, addr, pgd_entry, pgprot);
+ if (ret < 0) {
+ unmap_pgd_range(cpa->pgd, addr,
+ addr + (cpa->numpages << PAGE_SHIFT));
+ return ret;
+ }
+
+ cpa->numpages = ret;
return 0;
}
static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
int primary)
{
+ if (cpa->pgd)
+ return populate_pgd(cpa, vaddr);
+
/*
* Ignore all non primary paths.
*/
@@ -640,7 +1106,7 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
else
address = *cpa->vaddr;
repeat:
- kpte = lookup_address(address, &level);
+ kpte = _lookup_address_cpa(cpa, address, &level);
if (!kpte)
return __cpa_process_fault(cpa, address, primary);
@@ -659,6 +1125,18 @@ repeat:
new_prot = static_protections(new_prot, address, pfn);
/*
+ * Set the GLOBAL flags only if the PRESENT flag is
+ * set otherwise pte_present will return true even on
+ * a non present pte. The canon_pgprot will clear
+ * _PAGE_GLOBAL for the ancient hardware that doesn't
+ * support it.
+ */
+ if (pgprot_val(new_prot) & _PAGE_PRESENT)
+ pgprot_val(new_prot) |= _PAGE_GLOBAL;
+ else
+ pgprot_val(new_prot) &= ~_PAGE_GLOBAL;
+
+ /*
* We need to keep the pfn from the existing PTE,
* after all we're only going to change it's attributes
* not the memory it points to
@@ -692,7 +1170,7 @@ repeat:
/*
* We have to split the large page:
*/
- err = split_large_page(kpte, address);
+ err = split_large_page(cpa, kpte, address);
if (!err) {
/*
* Do a global flush tlb after splitting the large page
@@ -728,13 +1206,9 @@ static int cpa_process_alias(struct cpa_data *cpa)
unsigned long vaddr;
int ret;
- if (cpa->pfn >= max_pfn_mapped)
+ if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1))
return 0;
-#ifdef CONFIG_X86_64
- if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
- return 0;
-#endif
/*
* No need to redo, when the primary call touched the direct
* mapping already:
@@ -845,6 +1319,8 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
int ret, cache, checkalias;
unsigned long baddr = 0;
+ memset(&cpa, 0, sizeof(cpa));
+
/*
* Check, if we are requested to change a not supported
* feature:
@@ -917,10 +1393,10 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
cache = cache_attr(mask_set);
/*
- * On success we use clflush, when the CPU supports it to
- * avoid the wbindv. If the CPU does not support it and in the
+ * On success we use CLFLUSH, when the CPU supports it to
+ * avoid the WBINVD. If the CPU does not support it and in the
* error case we fall back to cpa_flush_all (which uses
- * wbindv):
+ * WBINVD):
*/
if (!ret && cpu_has_clflush) {
if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
@@ -997,7 +1473,7 @@ out_err:
}
EXPORT_SYMBOL(set_memory_uc);
-int _set_memory_array(unsigned long *addr, int addrinarray,
+static int _set_memory_array(unsigned long *addr, int addrinarray,
unsigned long new_type)
{
int i, j;
@@ -1291,6 +1767,7 @@ static int __set_pages_p(struct page *page, int numpages)
{
unsigned long tempaddr = (unsigned long) page_address(page);
struct cpa_data cpa = { .vaddr = &tempaddr,
+ .pgd = NULL,
.numpages = numpages,
.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
.mask_clr = __pgprot(0),
@@ -1309,6 +1786,7 @@ static int __set_pages_np(struct page *page, int numpages)
{
unsigned long tempaddr = (unsigned long) page_address(page);
struct cpa_data cpa = { .vaddr = &tempaddr,
+ .pgd = NULL,
.numpages = numpages,
.mask_set = __pgprot(0),
.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
@@ -1333,12 +1811,6 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
}
/*
- * If page allocator is not up yet then do not call c_p_a():
- */
- if (!debug_pagealloc_enabled)
- return;
-
- /*
* The return value is ignored as the calls cannot fail.
* Large pages for identity mappings are not used at boot time
* and hence no memory allocations during large page split.
@@ -1353,6 +1825,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
* but that can deadlock->flush only current cpu:
*/
__flush_tlb_all();
+
+ arch_flush_lazy_mmu_mode();
}
#ifdef CONFIG_HIBERNATION
@@ -1373,6 +1847,42 @@ bool kernel_page_present(struct page *page)
#endif /* CONFIG_DEBUG_PAGEALLOC */
+int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
+ unsigned numpages, unsigned long page_flags)
+{
+ int retval = -EINVAL;
+
+ struct cpa_data cpa = {
+ .vaddr = &address,
+ .pfn = pfn,
+ .pgd = pgd,
+ .numpages = numpages,
+ .mask_set = __pgprot(0),
+ .mask_clr = __pgprot(0),
+ .flags = 0,
+ };
+
+ if (!(__supported_pte_mask & _PAGE_NX))
+ goto out;
+
+ if (!(page_flags & _PAGE_NX))
+ cpa.mask_clr = __pgprot(_PAGE_NX);
+
+ cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
+
+ retval = __change_page_attr_set_clr(&cpa, 0);
+ __flush_tlb_all();
+
+out:
+ return retval;
+}
+
+void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address,
+ unsigned numpages)
+{
+ unmap_pgd_range(root, address, address + (numpages << PAGE_SHIFT));
+}
+
/*
* The testcases use internal knowledge of the implementation that shouldn't
* be exposed to the rest of the kernel. Include these directly here.
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index f6ff57b7efa..657438858e8 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -158,31 +158,47 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
return req_type;
}
+struct pagerange_state {
+ unsigned long cur_pfn;
+ int ram;
+ int not_ram;
+};
+
+static int
+pagerange_is_ram_callback(unsigned long initial_pfn, unsigned long total_nr_pages, void *arg)
+{
+ struct pagerange_state *state = arg;
+
+ state->not_ram |= initial_pfn > state->cur_pfn;
+ state->ram |= total_nr_pages > 0;
+ state->cur_pfn = initial_pfn + total_nr_pages;
+
+ return state->ram && state->not_ram;
+}
+
static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
{
- int ram_page = 0, not_rampage = 0;
- unsigned long page_nr;
+ int ret = 0;
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ struct pagerange_state state = {start_pfn, 0, 0};
- for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT);
- ++page_nr) {
- /*
- * For legacy reasons, physical address range in the legacy ISA
- * region is tracked as non-RAM. This will allow users of
- * /dev/mem to map portions of legacy ISA region, even when
- * some of those portions are listed(or not even listed) with
- * different e820 types(RAM/reserved/..)
- */
- if (page_nr >= (ISA_END_ADDRESS >> PAGE_SHIFT) &&
- page_is_ram(page_nr))
- ram_page = 1;
- else
- not_rampage = 1;
-
- if (ram_page == not_rampage)
- return -1;
+ /*
+ * For legacy reasons, physical address range in the legacy ISA
+ * region is tracked as non-RAM. This will allow users of
+ * /dev/mem to map portions of legacy ISA region, even when
+ * some of those portions are listed(or not even listed) with
+ * different e820 types(RAM/reserved/..)
+ */
+ if (start_pfn < ISA_END_ADDRESS >> PAGE_SHIFT)
+ start_pfn = ISA_END_ADDRESS >> PAGE_SHIFT;
+
+ if (start_pfn < end_pfn) {
+ ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
+ &state, pagerange_is_ram_callback);
}
- return ram_page;
+ return (ret > 0) ? -1 : (state.ram ? 1 : 0);
}
/*
@@ -209,9 +225,8 @@ static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
page = pfn_to_page(pfn);
type = get_page_memtype(page);
if (type != -1) {
- printk(KERN_INFO "reserve_ram_pages_type failed "
- "0x%Lx-0x%Lx, track 0x%lx, req 0x%lx\n",
- start, end, type, req_type);
+ printk(KERN_INFO "reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%lx, req 0x%lx\n",
+ start, end - 1, type, req_type);
if (new_type)
*new_type = type;
@@ -314,9 +329,9 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
err = rbt_memtype_check_insert(new, new_type);
if (err) {
- printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
- "track %s, req %s\n",
- start, end, cattr_name(new->type), cattr_name(req_type));
+ printk(KERN_INFO "reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
+ start, end - 1,
+ cattr_name(new->type), cattr_name(req_type));
kfree(new);
spin_unlock(&memtype_lock);
@@ -325,8 +340,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
spin_unlock(&memtype_lock);
- dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
- start, end, cattr_name(new->type), cattr_name(req_type),
+ dprintk("reserve_memtype added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n",
+ start, end - 1, cattr_name(new->type), cattr_name(req_type),
new_type ? cattr_name(*new_type) : "-");
return err;
@@ -360,14 +375,14 @@ int free_memtype(u64 start, u64 end)
spin_unlock(&memtype_lock);
if (!entry) {
- printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n",
- current->comm, current->pid, start, end);
+ printk(KERN_INFO "%s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
+ current->comm, current->pid, start, end - 1);
return -EINVAL;
}
kfree(entry);
- dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
+ dprintk("free_memtype request [mem %#010Lx-%#010Lx]\n", start, end - 1);
return 0;
}
@@ -491,9 +506,8 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
while (cursor < to) {
if (!devmem_is_allowed(pfn)) {
- printk(KERN_INFO
- "Program %s tried to access /dev/mem between %Lx->%Lx.\n",
- current->comm, from, to);
+ printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx]\n",
+ current->comm, from, to - 1);
return 0;
}
cursor += PAGE_SIZE;
@@ -546,20 +560,26 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags)
{
unsigned long id_sz;
- if (base >= __pa(high_memory))
+ if (base > __pa(high_memory-1))
return 0;
- id_sz = (__pa(high_memory) < base + size) ?
+ /*
+ * some areas in the middle of the kernel identity range
+ * are not mapped, like the PCI space.
+ */
+ if (!page_is_ram(base >> PAGE_SHIFT))
+ return 0;
+
+ id_sz = (__pa(high_memory-1) <= base + size) ?
__pa(high_memory) - base :
size;
if (ioremap_change_attr((unsigned long)__va(base), id_sz, flags) < 0) {
- printk(KERN_INFO
- "%s:%d ioremap_change_attr failed %s "
- "for %Lx-%Lx\n",
+ printk(KERN_INFO "%s:%d ioremap_change_attr failed %s "
+ "for [mem %#010Lx-%#010Lx]\n",
current->comm, current->pid,
cattr_name(flags),
- base, (unsigned long long)(base + size));
+ base, (unsigned long long)(base + size-1));
return -EINVAL;
}
return 0;
@@ -591,12 +611,11 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
flags = lookup_memtype(paddr);
if (want_flags != flags) {
- printk(KERN_WARNING
- "%s:%d map pfn RAM range req %s for %Lx-%Lx, got %s\n",
+ printk(KERN_WARNING "%s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
current->comm, current->pid,
cattr_name(want_flags),
(unsigned long long)paddr,
- (unsigned long long)(paddr + size),
+ (unsigned long long)(paddr + size - 1),
cattr_name(flags));
*vma_prot = __pgprot((pgprot_val(*vma_prot) &
(~_PAGE_CACHE_MASK)) |
@@ -614,11 +633,11 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
!is_new_memtype_allowed(paddr, size, want_flags, flags)) {
free_memtype(paddr, paddr + size);
printk(KERN_ERR "%s:%d map pfn expected mapping type %s"
- " for %Lx-%Lx, got %s\n",
+ " for [mem %#010Lx-%#010Lx], got %s\n",
current->comm, current->pid,
cattr_name(want_flags),
(unsigned long long)paddr,
- (unsigned long long)(paddr + size),
+ (unsigned long long)(paddr + size - 1),
cattr_name(flags));
return -EINVAL;
}
@@ -652,20 +671,20 @@ static void free_pfn_range(u64 paddr, unsigned long size)
}
/*
- * track_pfn_vma_copy is called when vma that is covering the pfnmap gets
+ * track_pfn_copy is called when vma that is covering the pfnmap gets
* copied through copy_page_range().
*
* If the vma has a linear pfn mapping for the entire range, we get the prot
* from pte and reserve the entire vma range with single reserve_pfn_range call.
*/
-int track_pfn_vma_copy(struct vm_area_struct *vma)
+int track_pfn_copy(struct vm_area_struct *vma)
{
resource_size_t paddr;
unsigned long prot;
unsigned long vma_size = vma->vm_end - vma->vm_start;
pgprot_t pgprot;
- if (is_linear_pfn_mapping(vma)) {
+ if (vma->vm_flags & VM_PAT) {
/*
* reserve the whole chunk covered by vma. We need the
* starting address and protection from pte.
@@ -682,31 +701,59 @@ int track_pfn_vma_copy(struct vm_area_struct *vma)
}
/*
- * track_pfn_vma_new is called when a _new_ pfn mapping is being established
- * for physical range indicated by pfn and size.
- *
* prot is passed in as a parameter for the new mapping. If the vma has a
* linear pfn mapping for the entire range reserve the entire vma range with
* single reserve_pfn_range call.
*/
-int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot,
- unsigned long pfn, unsigned long size)
+int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
+ unsigned long pfn, unsigned long addr, unsigned long size)
{
+ resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
unsigned long flags;
- resource_size_t paddr;
- unsigned long vma_size = vma->vm_end - vma->vm_start;
- if (is_linear_pfn_mapping(vma)) {
- /* reserve the whole chunk starting from vm_pgoff */
- paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
- return reserve_pfn_range(paddr, vma_size, prot, 0);
+ /* reserve the whole chunk starting from paddr */
+ if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) {
+ int ret;
+
+ ret = reserve_pfn_range(paddr, size, prot, 0);
+ if (!ret)
+ vma->vm_flags |= VM_PAT;
+ return ret;
}
if (!pat_enabled)
return 0;
- /* for vm_insert_pfn and friends, we set prot based on lookup */
- flags = lookup_memtype(pfn << PAGE_SHIFT);
+ /*
+ * For anything smaller than the vma size we set prot based on the
+ * lookup.
+ */
+ flags = lookup_memtype(paddr);
+
+ /* Check memtype for the remaining pages */
+ while (size > PAGE_SIZE) {
+ size -= PAGE_SIZE;
+ paddr += PAGE_SIZE;
+ if (flags != lookup_memtype(paddr))
+ return -EINVAL;
+ }
+
+ *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
+ flags);
+
+ return 0;
+}
+
+int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
+ unsigned long pfn)
+{
+ unsigned long flags;
+
+ if (!pat_enabled)
+ return 0;
+
+ /* Set prot based on lookup */
+ flags = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT);
*prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
flags);
@@ -714,22 +761,31 @@ int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot,
}
/*
- * untrack_pfn_vma is called while unmapping a pfnmap for a region.
+ * untrack_pfn is called while unmapping a pfnmap for a region.
* untrack can be called for a specific region indicated by pfn and size or
- * can be for the entire vma (in which case size can be zero).
+ * can be for the entire vma (in which case pfn, size are zero).
*/
-void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
- unsigned long size)
+void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
+ unsigned long size)
{
resource_size_t paddr;
- unsigned long vma_size = vma->vm_end - vma->vm_start;
+ unsigned long prot;
- if (is_linear_pfn_mapping(vma)) {
- /* free the whole chunk starting from vm_pgoff */
- paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
- free_pfn_range(paddr, vma_size);
+ if (!(vma->vm_flags & VM_PAT))
return;
+
+ /* free the chunk starting from pfn or the whole chunk */
+ paddr = (resource_size_t)pfn << PAGE_SHIFT;
+ if (!paddr && !size) {
+ if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ size = vma->vm_end - vma->vm_start;
}
+ free_pfn_range(paddr, size);
+ vma->vm_flags &= ~VM_PAT;
}
pgprot_t pgprot_writecombine(pgprot_t prot)
diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
index 8acaddd0fb2..415f6c4ced3 100644
--- a/arch/x86/mm/pat_rbtree.c
+++ b/arch/x86/mm/pat_rbtree.c
@@ -12,7 +12,7 @@
#include <linux/debugfs.h>
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/rbtree.h>
+#include <linux/rbtree_augmented.h>
#include <linux/sched.h>
#include <linux/gfp.h>
@@ -54,29 +54,24 @@ static u64 get_subtree_max_end(struct rb_node *node)
return ret;
}
-/* Update 'subtree_max_end' for a node, based on node and its children */
-static void memtype_rb_augment_cb(struct rb_node *node, void *__unused)
+static u64 compute_subtree_max_end(struct memtype *data)
{
- struct memtype *data;
- u64 max_end, child_max_end;
-
- if (!node)
- return;
+ u64 max_end = data->end, child_max_end;
- data = container_of(node, struct memtype, rb);
- max_end = data->end;
-
- child_max_end = get_subtree_max_end(node->rb_right);
+ child_max_end = get_subtree_max_end(data->rb.rb_right);
if (child_max_end > max_end)
max_end = child_max_end;
- child_max_end = get_subtree_max_end(node->rb_left);
+ child_max_end = get_subtree_max_end(data->rb.rb_left);
if (child_max_end > max_end)
max_end = child_max_end;
- data->subtree_max_end = max_end;
+ return max_end;
}
+RB_DECLARE_CALLBACKS(static, memtype_rb_augment_cb, struct memtype, rb,
+ u64, subtree_max_end, compute_subtree_max_end)
+
/* Find the first (lowest start addr) overlapping range from rb tree */
static struct memtype *memtype_rb_lowest_match(struct rb_root *root,
u64 start, u64 end)
@@ -179,15 +174,17 @@ static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata)
struct memtype *data = container_of(*node, struct memtype, rb);
parent = *node;
+ if (data->subtree_max_end < newdata->end)
+ data->subtree_max_end = newdata->end;
if (newdata->start <= data->start)
node = &((*node)->rb_left);
else if (newdata->start > data->start)
node = &((*node)->rb_right);
}
+ newdata->subtree_max_end = newdata->end;
rb_link_node(&newdata->rb, parent, node);
- rb_insert_color(&newdata->rb, root);
- rb_augment_insert(&newdata->rb, memtype_rb_augment_cb, NULL);
+ rb_insert_augmented(&newdata->rb, root, &memtype_rb_augment_cb);
}
int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type)
@@ -209,16 +206,13 @@ int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type)
struct memtype *rbt_memtype_erase(u64 start, u64 end)
{
- struct rb_node *deepest;
struct memtype *data;
data = memtype_rb_exact_match(&memtype_rbroot, start, end);
if (!data)
goto out;
- deepest = rb_augment_erase_begin(&data->rb);
- rb_erase(&data->rb, &memtype_rbroot);
- rb_augment_erase_end(deepest, memtype_rb_augment_cb, NULL);
+ rb_erase_augmented(&data->rb, &memtype_rbroot, &memtype_rb_augment_cb);
out:
return data;
}
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
index 38e6d174c49..9f0614daea8 100644
--- a/arch/x86/mm/pf_in.c
+++ b/arch/x86/mm/pf_in.c
@@ -414,22 +414,17 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
unsigned char *p;
struct prefix_bits prf;
int i;
- unsigned long rv;
p = (unsigned char *)ins_addr;
p += skip_prefix(p, &prf);
p += get_opcode(p, &opcode);
for (i = 0; i < ARRAY_SIZE(reg_rop); i++)
- if (reg_rop[i] == opcode) {
- rv = REG_READ;
+ if (reg_rop[i] == opcode)
goto do_work;
- }
for (i = 0; i < ARRAY_SIZE(reg_wop); i++)
- if (reg_wop[i] == opcode) {
- rv = REG_WRITE;
+ if (reg_wop[i] == opcode)
goto do_work;
- }
printk(KERN_ERR "mmiotrace: Not a register instruction, opcode "
"0x%02x\n", opcode);
@@ -474,16 +469,13 @@ unsigned long get_ins_imm_val(unsigned long ins_addr)
unsigned char *p;
struct prefix_bits prf;
int i;
- unsigned long rv;
p = (unsigned char *)ins_addr;
p += skip_prefix(p, &prf);
p += get_opcode(p, &opcode);
for (i = 0; i < ARRAY_SIZE(imm_wop); i++)
- if (imm_wop[i] == opcode) {
- rv = IMM_WRITE;
+ if (imm_wop[i] == opcode)
goto do_work;
- }
printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode "
"0x%02x\n", opcode);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 8be8c7d7bc8..6fb6927f9e7 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -25,8 +25,12 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
struct page *pte;
pte = alloc_pages(__userpte_alloc_gfp, 0);
- if (pte)
- pgtable_page_ctor(pte);
+ if (!pte)
+ return NULL;
+ if (!pgtable_page_ctor(pte)) {
+ __free_page(pte);
+ return NULL;
+ }
return pte;
}
@@ -57,8 +61,17 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
#if PAGETABLE_LEVELS > 2
void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
+ struct page *page = virt_to_page(pmd);
paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
- tlb_remove_page(tlb, virt_to_page(pmd));
+ /*
+ * NOTE! For PAE, any changes to the top page-directory-pointer-table
+ * entries need a full cr3 reload to flush.
+ */
+#ifdef CONFIG_X86_PAE
+ tlb->need_flush_all = 1;
+#endif
+ pgtable_pmd_page_dtor(page);
+ tlb_remove_page(tlb, page);
}
#if PAGETABLE_LEVELS > 3
@@ -121,14 +134,12 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
static void pgd_dtor(pgd_t *pgd)
{
- unsigned long flags; /* can be called from interrupt context */
-
if (SHARED_KERNEL_PMD)
return;
- spin_lock_irqsave(&pgd_lock, flags);
+ spin_lock(&pgd_lock);
pgd_list_del(pgd);
- spin_unlock_irqrestore(&pgd_lock, flags);
+ spin_unlock(&pgd_lock);
}
/*
@@ -139,7 +150,7 @@ static void pgd_dtor(pgd_t *pgd)
* against pageattr.c; it is the unique case in which a valid change
* of kernel pagetables can't be lazily synchronized by vmalloc faults.
* vmalloc faults work because attached pagetables are never freed.
- * -- wli
+ * -- nyc
*/
#ifdef CONFIG_X86_PAE
@@ -170,8 +181,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
* section 8.1: in PAE mode we explicitly have to flush the
* TLB via cr3 if the top-level pgd is changed...
*/
- if (mm == current->active_mm)
- write_cr3(read_cr3());
+ flush_tlb_mm(mm);
}
#else /* !CONFIG_X86_PAE */
@@ -185,8 +195,10 @@ static void free_pmds(pmd_t *pmds[])
int i;
for(i = 0; i < PREALLOCATED_PMDS; i++)
- if (pmds[i])
+ if (pmds[i]) {
+ pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
free_page((unsigned long)pmds[i]);
+ }
}
static int preallocate_pmds(pmd_t *pmds[])
@@ -196,8 +208,13 @@ static int preallocate_pmds(pmd_t *pmds[])
for(i = 0; i < PREALLOCATED_PMDS; i++) {
pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP);
- if (pmd == NULL)
+ if (!pmd)
+ failed = true;
+ if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) {
+ free_page((unsigned long)pmd);
+ pmd = NULL;
failed = true;
+ }
pmds[i] = pmd;
}
@@ -236,7 +253,6 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
{
pud_t *pud;
- unsigned long addr;
int i;
if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
@@ -244,8 +260,7 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
pud = pud_offset(pgd, 0);
- for (addr = i = 0; i < PREALLOCATED_PMDS;
- i++, pud++, addr += PUD_SIZE) {
+ for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
pmd_t *pmd = pmds[i];
if (i >= KERNEL_PGD_BOUNDARY)
@@ -260,7 +275,6 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
{
pgd_t *pgd;
pmd_t *pmds[PREALLOCATED_PMDS];
- unsigned long flags;
pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
@@ -280,12 +294,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
* respect to anything walking the pgd_list, so that they
* never see a partially populated pgd.
*/
- spin_lock_irqsave(&pgd_lock, flags);
+ spin_lock(&pgd_lock);
pgd_ctor(mm, pgd);
pgd_prepopulate_pmd(mm, pgd, pmds);
- spin_unlock_irqrestore(&pgd_lock, flags);
+ spin_unlock(&pgd_lock);
return pgd;
@@ -305,6 +319,13 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
free_page((unsigned long)pgd);
}
+/*
+ * Used to set accessed or dirty bits in the page table entries
+ * on other architectures. On x86, the accessed and dirty bits
+ * are tracked by hardware. However, do_wp_page calls this function
+ * to also make the pte writeable at the same time the dirty bit is
+ * set. In that case we do actually need to write the PTE.
+ */
int ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
pte_t entry, int dirty)
@@ -314,12 +335,35 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
if (changed && dirty) {
*ptep = entry;
pte_update_defer(vma->vm_mm, address, ptep);
- flush_tlb_page(vma, address);
}
return changed;
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp,
+ pmd_t entry, int dirty)
+{
+ int changed = !pmd_same(*pmdp, entry);
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+ if (changed && dirty) {
+ *pmdp = entry;
+ pmd_update_defer(vma->vm_mm, address, pmdp);
+ /*
+ * We had a write-protection fault here and changed the pmd
+ * to to more permissive. No need to flush the TLB for that,
+ * #PF is architecturally guaranteed to do that and in the
+ * worst-case we'll generate a spurious fault.
+ */
+ }
+
+ return changed;
+}
+#endif
+
int ptep_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
@@ -335,18 +379,72 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
return ret;
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+ int ret = 0;
+
+ if (pmd_young(*pmdp))
+ ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
+ (unsigned long *)pmdp);
+
+ if (ret)
+ pmd_update(vma->vm_mm, addr, pmdp);
+
+ return ret;
+}
+#endif
+
int ptep_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep)
{
+ /*
+ * On x86 CPUs, clearing the accessed bit without a TLB flush
+ * doesn't cause data corruption. [ It could cause incorrect
+ * page aging and the (mistaken) reclaim of hot pages, but the
+ * chance of that should be relatively low. ]
+ *
+ * So as a performance optimization don't flush the TLB when
+ * clearing the accessed bit, it will eventually be flushed by
+ * a context switch or a VM operation anyway. [ In the rare
+ * event of it not getting flushed for a long time the delay
+ * shouldn't really matter because there's no real memory
+ * pressure for swapout to react to. ]
+ */
+ return ptep_test_and_clear_young(vma, address, ptep);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
int young;
- young = ptep_test_and_clear_young(vma, address, ptep);
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+ young = pmdp_test_and_clear_young(vma, address, pmdp);
if (young)
- flush_tlb_page(vma, address);
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return young;
}
+void pmdp_splitting_flush(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ int set;
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
+ (unsigned long *)pmdp);
+ if (set) {
+ pmd_update(vma->vm_mm, address, pmdp);
+ /* need tlb flush only to serialize against gup-fast */
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ }
+}
+#endif
+
/**
* reserve_top_address - reserves a hole in the top of kernel address space
* @reserve - size of hole to reserve
@@ -358,9 +456,9 @@ void __init reserve_top_address(unsigned long reserve)
{
#ifdef CONFIG_X86_32
BUG_ON(fixmaps_set > 0);
- printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
- (int)-reserve);
- __FIXADDR_TOP = -reserve - PAGE_SIZE;
+ __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE;
+ printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n",
+ -reserve, __FIXADDR_TOP + PAGE_SIZE);
#endif
}
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index cac71849925..4dd8cf65257 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -10,7 +10,6 @@
#include <linux/spinlock.h>
#include <linux/module.h>
-#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
@@ -128,7 +127,7 @@ static int __init parse_reservetop(char *arg)
address = memparse(arg, &arg);
reserve_top_address(address);
- fixup_early_ioremap();
+ early_ioremap_init();
return 0;
}
early_param("reservetop", parse_reservetop);
diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c
index d2e2735327b..e666cbbb926 100644
--- a/arch/x86/mm/physaddr.c
+++ b/arch/x86/mm/physaddr.c
@@ -1,3 +1,4 @@
+#include <linux/bootmem.h>
#include <linux/mmdebug.h>
#include <linux/module.h>
#include <linux/mm.h>
@@ -8,33 +9,54 @@
#ifdef CONFIG_X86_64
+#ifdef CONFIG_DEBUG_VIRTUAL
unsigned long __phys_addr(unsigned long x)
{
- if (x >= __START_KERNEL_map) {
- x -= __START_KERNEL_map;
- VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE);
- x += phys_base;
+ unsigned long y = x - __START_KERNEL_map;
+
+ /* use the carry flag to determine if x was < __START_KERNEL_map */
+ if (unlikely(x > y)) {
+ x = y + phys_base;
+
+ VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
} else {
- VIRTUAL_BUG_ON(x < PAGE_OFFSET);
- x -= PAGE_OFFSET;
- VIRTUAL_BUG_ON(!phys_addr_valid(x));
+ x = y + (__START_KERNEL_map - PAGE_OFFSET);
+
+ /* carry flag will be set if starting x was >= PAGE_OFFSET */
+ VIRTUAL_BUG_ON((x > y) || !phys_addr_valid(x));
}
+
return x;
}
EXPORT_SYMBOL(__phys_addr);
+unsigned long __phys_addr_symbol(unsigned long x)
+{
+ unsigned long y = x - __START_KERNEL_map;
+
+ /* only check upper bounds since lower bounds will trigger carry */
+ VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
+
+ return y + phys_base;
+}
+EXPORT_SYMBOL(__phys_addr_symbol);
+#endif
+
bool __virt_addr_valid(unsigned long x)
{
- if (x >= __START_KERNEL_map) {
- x -= __START_KERNEL_map;
- if (x >= KERNEL_IMAGE_SIZE)
+ unsigned long y = x - __START_KERNEL_map;
+
+ /* use the carry flag to determine if x was < __START_KERNEL_map */
+ if (unlikely(x > y)) {
+ x = y + phys_base;
+
+ if (y >= KERNEL_IMAGE_SIZE)
return false;
- x += phys_base;
} else {
- if (x < PAGE_OFFSET)
- return false;
- x -= PAGE_OFFSET;
- if (!phys_addr_valid(x))
+ x = y + (__START_KERNEL_map - PAGE_OFFSET);
+
+ /* carry flag will be set if starting x was >= PAGE_OFFSET */
+ if ((x > y) || !phys_addr_valid(x))
return false;
}
@@ -47,10 +69,16 @@ EXPORT_SYMBOL(__virt_addr_valid);
#ifdef CONFIG_DEBUG_VIRTUAL
unsigned long __phys_addr(unsigned long x)
{
+ unsigned long phys_addr = x - PAGE_OFFSET;
/* VMALLOC_* aren't constants */
VIRTUAL_BUG_ON(x < PAGE_OFFSET);
VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
- return x - PAGE_OFFSET;
+ /* max_low_pfn is set early, but not _that_ early */
+ if (max_low_pfn) {
+ VIRTUAL_BUG_ON((phys_addr >> PAGE_SHIFT) > max_low_pfn);
+ BUG_ON(slow_virt_to_phys((void *)x) != phys_addr);
+ }
+ return phys_addr;
}
EXPORT_SYMBOL(__phys_addr);
#endif
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
index a3250aa3408..90555bf60aa 100644
--- a/arch/x86/mm/setup_nx.c
+++ b/arch/x86/mm/setup_nx.c
@@ -5,7 +5,7 @@
#include <asm/pgtable.h>
#include <asm/proto.h>
-static int disable_nx __cpuinitdata;
+static int disable_nx;
/*
* noexec = on|off
@@ -29,7 +29,7 @@ static int __init noexec_setup(char *str)
}
early_param("noexec", noexec_setup);
-void __cpuinit x86_configure_nx(void)
+void x86_configure_nx(void)
{
if (cpu_has_nx && !disable_nx)
__supported_pte_mask |= _PAGE_NX;
@@ -41,7 +41,7 @@ void __init x86_report_nx(void)
{
if (!cpu_has_nx) {
printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
- "missing in CPU or disabled in BIOS!\n");
+ "missing in CPU!\n");
} else {
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
if (disable_nx) {
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
new file mode 100644
index 00000000000..66338a60aa6
--- /dev/null
+++ b/arch/x86/mm/srat.c
@@ -0,0 +1,222 @@
+/*
+ * ACPI 3.0 based NUMA setup
+ * Copyright 2004 Andi Kleen, SuSE Labs.
+ *
+ * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
+ *
+ * Called from acpi_numa_init while reading the SRAT and SLIT tables.
+ * Assumes all memory regions belonging to a single proximity domain
+ * are in one chunk. Holes between them will be included in the node.
+ */
+
+#include <linux/kernel.h>
+#include <linux/acpi.h>
+#include <linux/mmzone.h>
+#include <linux/bitmap.h>
+#include <linux/module.h>
+#include <linux/topology.h>
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/mm.h>
+#include <asm/proto.h>
+#include <asm/numa.h>
+#include <asm/e820.h>
+#include <asm/apic.h>
+#include <asm/uv/uv.h>
+
+int acpi_numa __initdata;
+
+static __init int setup_node(int pxm)
+{
+ return acpi_map_pxm_to_node(pxm);
+}
+
+static __init void bad_srat(void)
+{
+ printk(KERN_ERR "SRAT: SRAT not used.\n");
+ acpi_numa = -1;
+}
+
+static __init inline int srat_disabled(void)
+{
+ return acpi_numa < 0;
+}
+
+/*
+ * Callback for SLIT parsing. pxm_to_node() returns NUMA_NO_NODE for
+ * I/O localities since SRAT does not list them. I/O localities are
+ * not supported at this point.
+ */
+void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
+{
+ int i, j;
+
+ for (i = 0; i < slit->locality_count; i++) {
+ const int from_node = pxm_to_node(i);
+
+ if (from_node == NUMA_NO_NODE)
+ continue;
+
+ for (j = 0; j < slit->locality_count; j++) {
+ const int to_node = pxm_to_node(j);
+
+ if (to_node == NUMA_NO_NODE)
+ continue;
+
+ numa_set_distance(from_node, to_node,
+ slit->entry[slit->locality_count * i + j]);
+ }
+ }
+}
+
+/* Callback for Proximity Domain -> x2APIC mapping */
+void __init
+acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
+{
+ int pxm, node;
+ int apic_id;
+
+ if (srat_disabled())
+ return;
+ if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) {
+ bad_srat();
+ return;
+ }
+ if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
+ return;
+ pxm = pa->proximity_domain;
+ apic_id = pa->apic_id;
+ if (!apic->apic_id_valid(apic_id)) {
+ printk(KERN_INFO "SRAT: PXM %u -> X2APIC 0x%04x ignored\n",
+ pxm, apic_id);
+ return;
+ }
+ node = setup_node(pxm);
+ if (node < 0) {
+ printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
+ bad_srat();
+ return;
+ }
+
+ if (apic_id >= MAX_LOCAL_APIC) {
+ printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
+ return;
+ }
+ set_apicid_to_node(apic_id, node);
+ node_set(node, numa_nodes_parsed);
+ acpi_numa = 1;
+ printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
+ pxm, apic_id, node);
+}
+
+/* Callback for Proximity Domain -> LAPIC mapping */
+void __init
+acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
+{
+ int pxm, node;
+ int apic_id;
+
+ if (srat_disabled())
+ return;
+ if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
+ bad_srat();
+ return;
+ }
+ if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
+ return;
+ pxm = pa->proximity_domain_lo;
+ if (acpi_srat_revision >= 2)
+ pxm |= *((unsigned int*)pa->proximity_domain_hi) << 8;
+ node = setup_node(pxm);
+ if (node < 0) {
+ printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
+ bad_srat();
+ return;
+ }
+
+ if (get_uv_system_type() >= UV_X2APIC)
+ apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
+ else
+ apic_id = pa->apic_id;
+
+ if (apic_id >= MAX_LOCAL_APIC) {
+ printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
+ return;
+ }
+
+ set_apicid_to_node(apic_id, node);
+ node_set(node, numa_nodes_parsed);
+ acpi_numa = 1;
+ printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
+ pxm, apic_id, node);
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static inline int save_add_info(void) {return 1;}
+#else
+static inline int save_add_info(void) {return 0;}
+#endif
+
+/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
+int __init
+acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
+{
+ u64 start, end;
+ u32 hotpluggable;
+ int node, pxm;
+
+ if (srat_disabled())
+ goto out_err;
+ if (ma->header.length != sizeof(struct acpi_srat_mem_affinity))
+ goto out_err_bad_srat;
+ if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
+ goto out_err;
+ hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE;
+ if (hotpluggable && !save_add_info())
+ goto out_err;
+
+ start = ma->base_address;
+ end = start + ma->length;
+ pxm = ma->proximity_domain;
+ if (acpi_srat_revision <= 1)
+ pxm &= 0xff;
+
+ node = setup_node(pxm);
+ if (node < 0) {
+ printk(KERN_ERR "SRAT: Too many proximity domains.\n");
+ goto out_err_bad_srat;
+ }
+
+ if (numa_add_memblk(node, start, end) < 0)
+ goto out_err_bad_srat;
+
+ node_set(node, numa_nodes_parsed);
+
+ pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s\n",
+ node, pxm,
+ (unsigned long long) start, (unsigned long long) end - 1,
+ hotpluggable ? " hotplug" : "");
+
+ /* Mark hotplug range in memblock. */
+ if (hotpluggable && memblock_mark_hotplug(start, ma->length))
+ pr_warn("SRAT: Failed to mark hotplug range [mem %#010Lx-%#010Lx] in memblock\n",
+ (unsigned long long)start, (unsigned long long)end - 1);
+
+ return 0;
+out_err_bad_srat:
+ bad_srat();
+out_err:
+ return -1;
+}
+
+void __init acpi_numa_arch_fixup(void) {}
+
+int __init x86_acpi_numa_init(void)
+{
+ int ret;
+
+ ret = acpi_numa_init();
+ if (ret < 0)
+ return ret;
+ return srat_disabled() ? -EINVAL : 0;
+}
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
deleted file mode 100644
index a17dffd136c..00000000000
--- a/arch/x86/mm/srat_32.c
+++ /dev/null
@@ -1,286 +0,0 @@
-/*
- * Some of the code in this file has been gleaned from the 64 bit
- * discontigmem support code base.
- *
- * Copyright (C) 2002, IBM Corp.
- *
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to Pat Gaughen <gone@us.ibm.com>
- */
-#include <linux/mm.h>
-#include <linux/bootmem.h>
-#include <linux/memblock.h>
-#include <linux/mmzone.h>
-#include <linux/acpi.h>
-#include <linux/nodemask.h>
-#include <asm/srat.h>
-#include <asm/topology.h>
-#include <asm/smp.h>
-#include <asm/e820.h>
-
-/*
- * proximity macros and definitions
- */
-#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */
-#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */
-#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
-#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
-/* bitmap length; _PXM is at most 255 */
-#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
-static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */
-
-#define MAX_CHUNKS_PER_NODE 3
-#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
-struct node_memory_chunk_s {
- unsigned long start_pfn;
- unsigned long end_pfn;
- u8 pxm; // proximity domain of node
- u8 nid; // which cnode contains this chunk?
- u8 bank; // which mem bank on this node
-};
-static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
-
-static int __initdata num_memory_chunks; /* total number of memory chunks */
-static u8 __initdata apicid_to_pxm[MAX_APICID];
-
-int numa_off __initdata;
-int acpi_numa __initdata;
-
-static __init void bad_srat(void)
-{
- printk(KERN_ERR "SRAT: SRAT not used.\n");
- acpi_numa = -1;
- num_memory_chunks = 0;
-}
-
-static __init inline int srat_disabled(void)
-{
- return numa_off || acpi_numa < 0;
-}
-
-/* Identify CPU proximity domains */
-void __init
-acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
-{
- if (srat_disabled())
- return;
- if (cpu_affinity->header.length !=
- sizeof(struct acpi_srat_cpu_affinity)) {
- bad_srat();
- return;
- }
-
- if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0)
- return; /* empty entry */
-
- /* mark this node as "seen" in node bitmap */
- BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
-
- apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
-
- printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n",
- cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo);
-}
-
-/*
- * Identify memory proximity domains and hot-remove capabilities.
- * Fill node memory chunk list structure.
- */
-void __init
-acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity)
-{
- unsigned long long paddr, size;
- unsigned long start_pfn, end_pfn;
- u8 pxm;
- struct node_memory_chunk_s *p, *q, *pend;
-
- if (srat_disabled())
- return;
- if (memory_affinity->header.length !=
- sizeof(struct acpi_srat_mem_affinity)) {
- bad_srat();
- return;
- }
-
- if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
- return; /* empty entry */
-
- pxm = memory_affinity->proximity_domain & 0xff;
-
- /* mark this node as "seen" in node bitmap */
- BMAP_SET(pxm_bitmap, pxm);
-
- /* calculate info for memory chunk structure */
- paddr = memory_affinity->base_address;
- size = memory_affinity->length;
-
- start_pfn = paddr >> PAGE_SHIFT;
- end_pfn = (paddr + size) >> PAGE_SHIFT;
-
-
- if (num_memory_chunks >= MAXCHUNKS) {
- printk(KERN_WARNING "Too many mem chunks in SRAT."
- " Ignoring %lld MBytes at %llx\n",
- size/(1024*1024), paddr);
- return;
- }
-
- /* Insertion sort based on base address */
- pend = &node_memory_chunk[num_memory_chunks];
- for (p = &node_memory_chunk[0]; p < pend; p++) {
- if (start_pfn < p->start_pfn)
- break;
- }
- if (p < pend) {
- for (q = pend; q >= p; q--)
- *(q + 1) = *q;
- }
- p->start_pfn = start_pfn;
- p->end_pfn = end_pfn;
- p->pxm = pxm;
-
- num_memory_chunks++;
-
- printk(KERN_DEBUG "Memory range %08lx to %08lx"
- " in proximity domain %02x %s\n",
- start_pfn, end_pfn,
- pxm,
- ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ?
- "enabled and removable" : "enabled" ) );
-}
-
-/* Callback for SLIT parsing */
-void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
-{
-}
-
-void acpi_numa_arch_fixup(void)
-{
-}
-/*
- * The SRAT table always lists ascending addresses, so can always
- * assume that the first "start" address that you see is the real
- * start of the node, and that the current "end" address is after
- * the previous one.
- */
-static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
-{
- /*
- * Only add present memory as told by the e820.
- * There is no guarantee from the SRAT that the memory it
- * enumerates is present at boot time because it represents
- * *possible* memory hotplug areas the same as normal RAM.
- */
- if (memory_chunk->start_pfn >= max_pfn) {
- printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n",
- memory_chunk->start_pfn, memory_chunk->end_pfn);
- return -1;
- }
- if (memory_chunk->nid != nid)
- return -1;
-
- if (!node_has_online_mem(nid))
- node_start_pfn[nid] = memory_chunk->start_pfn;
-
- if (node_start_pfn[nid] > memory_chunk->start_pfn)
- node_start_pfn[nid] = memory_chunk->start_pfn;
-
- if (node_end_pfn[nid] < memory_chunk->end_pfn)
- node_end_pfn[nid] = memory_chunk->end_pfn;
-
- return 0;
-}
-
-int __init get_memcfg_from_srat(void)
-{
- int i, j, nid;
-
-
- if (srat_disabled())
- goto out_fail;
-
- if (num_memory_chunks == 0) {
- printk(KERN_DEBUG
- "could not find any ACPI SRAT memory areas.\n");
- goto out_fail;
- }
-
- /* Calculate total number of nodes in system from PXM bitmap and create
- * a set of sequential node IDs starting at zero. (ACPI doesn't seem
- * to specify the range of _PXM values.)
- */
- /*
- * MCD - we no longer HAVE to number nodes sequentially. PXM domain
- * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically
- * 32, so we will continue numbering them in this manner until MAX_NUMNODES
- * approaches MAX_PXM_DOMAINS for i386.
- */
- nodes_clear(node_online_map);
- for (i = 0; i < MAX_PXM_DOMAINS; i++) {
- if (BMAP_TEST(pxm_bitmap, i)) {
- int nid = acpi_map_pxm_to_node(i);
- node_set_online(nid);
- }
- }
- BUG_ON(num_online_nodes() == 0);
-
- /* set cnode id in memory chunk structure */
- for (i = 0; i < num_memory_chunks; i++)
- node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm);
-
- printk(KERN_DEBUG "pxm bitmap: ");
- for (i = 0; i < sizeof(pxm_bitmap); i++) {
- printk(KERN_CONT "%02x ", pxm_bitmap[i]);
- }
- printk(KERN_CONT "\n");
- printk(KERN_DEBUG "Number of logical nodes in system = %d\n",
- num_online_nodes());
- printk(KERN_DEBUG "Number of memory chunks in system = %d\n",
- num_memory_chunks);
-
- for (i = 0; i < MAX_APICID; i++)
- apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]);
-
- for (j = 0; j < num_memory_chunks; j++){
- struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
- printk(KERN_DEBUG
- "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
- j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
- if (node_read_chunk(chunk->nid, chunk))
- continue;
-
- memblock_x86_register_active_regions(chunk->nid, chunk->start_pfn,
- min(chunk->end_pfn, max_pfn));
- }
- /* for out of order entries in SRAT */
- sort_node_map();
-
- for_each_online_node(nid) {
- unsigned long start = node_start_pfn[nid];
- unsigned long end = min(node_end_pfn[nid], max_pfn);
-
- memory_present(nid, start, end);
- node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
- }
- return 1;
-out_fail:
- printk(KERN_DEBUG "failed to get NUMA memory information from SRAT"
- " table\n");
- return 0;
-}
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
deleted file mode 100644
index a35cb9d8b06..00000000000
--- a/arch/x86/mm/srat_64.c
+++ /dev/null
@@ -1,565 +0,0 @@
-/*
- * ACPI 3.0 based NUMA setup
- * Copyright 2004 Andi Kleen, SuSE Labs.
- *
- * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
- *
- * Called from acpi_numa_init while reading the SRAT and SLIT tables.
- * Assumes all memory regions belonging to a single proximity domain
- * are in one chunk. Holes between them will be included in the node.
- */
-
-#include <linux/kernel.h>
-#include <linux/acpi.h>
-#include <linux/mmzone.h>
-#include <linux/bitmap.h>
-#include <linux/module.h>
-#include <linux/topology.h>
-#include <linux/bootmem.h>
-#include <linux/memblock.h>
-#include <linux/mm.h>
-#include <asm/proto.h>
-#include <asm/numa.h>
-#include <asm/e820.h>
-#include <asm/apic.h>
-#include <asm/uv/uv.h>
-
-int acpi_numa __initdata;
-
-static struct acpi_table_slit *acpi_slit;
-
-static nodemask_t nodes_parsed __initdata;
-static nodemask_t cpu_nodes_parsed __initdata;
-static struct bootnode nodes[MAX_NUMNODES] __initdata;
-static struct bootnode nodes_add[MAX_NUMNODES];
-
-static int num_node_memblks __initdata;
-static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
-static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;
-
-static __init int setup_node(int pxm)
-{
- return acpi_map_pxm_to_node(pxm);
-}
-
-static __init int conflicting_memblks(unsigned long start, unsigned long end)
-{
- int i;
- for (i = 0; i < num_node_memblks; i++) {
- struct bootnode *nd = &node_memblk_range[i];
- if (nd->start == nd->end)
- continue;
- if (nd->end > start && nd->start < end)
- return memblk_nodeid[i];
- if (nd->end == end && nd->start == start)
- return memblk_nodeid[i];
- }
- return -1;
-}
-
-static __init void cutoff_node(int i, unsigned long start, unsigned long end)
-{
- struct bootnode *nd = &nodes[i];
-
- if (nd->start < start) {
- nd->start = start;
- if (nd->end < nd->start)
- nd->start = nd->end;
- }
- if (nd->end > end) {
- nd->end = end;
- if (nd->start > nd->end)
- nd->start = nd->end;
- }
-}
-
-static __init void bad_srat(void)
-{
- int i;
- printk(KERN_ERR "SRAT: SRAT not used.\n");
- acpi_numa = -1;
- for (i = 0; i < MAX_LOCAL_APIC; i++)
- apicid_to_node[i] = NUMA_NO_NODE;
- for (i = 0; i < MAX_NUMNODES; i++) {
- nodes[i].start = nodes[i].end = 0;
- nodes_add[i].start = nodes_add[i].end = 0;
- }
- remove_all_active_ranges();
-}
-
-static __init inline int srat_disabled(void)
-{
- return numa_off || acpi_numa < 0;
-}
-
-/* Callback for SLIT parsing */
-void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
-{
- unsigned length;
- unsigned long phys;
-
- length = slit->header.length;
- phys = memblock_find_in_range(0, max_pfn_mapped<<PAGE_SHIFT, length,
- PAGE_SIZE);
-
- if (phys == MEMBLOCK_ERROR)
- panic(" Can not save slit!\n");
-
- acpi_slit = __va(phys);
- memcpy(acpi_slit, slit, length);
- memblock_x86_reserve_range(phys, phys + length, "ACPI SLIT");
-}
-
-/* Callback for Proximity Domain -> x2APIC mapping */
-void __init
-acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
-{
- int pxm, node;
- int apic_id;
-
- if (srat_disabled())
- return;
- if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) {
- bad_srat();
- return;
- }
- if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
- return;
- pxm = pa->proximity_domain;
- node = setup_node(pxm);
- if (node < 0) {
- printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
- bad_srat();
- return;
- }
-
- apic_id = pa->apic_id;
- apicid_to_node[apic_id] = node;
- node_set(node, cpu_nodes_parsed);
- acpi_numa = 1;
- printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
- pxm, apic_id, node);
-}
-
-/* Callback for Proximity Domain -> LAPIC mapping */
-void __init
-acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
-{
- int pxm, node;
- int apic_id;
-
- if (srat_disabled())
- return;
- if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
- bad_srat();
- return;
- }
- if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
- return;
- pxm = pa->proximity_domain_lo;
- node = setup_node(pxm);
- if (node < 0) {
- printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
- bad_srat();
- return;
- }
-
- if (get_uv_system_type() >= UV_X2APIC)
- apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
- else
- apic_id = pa->apic_id;
- apicid_to_node[apic_id] = node;
- node_set(node, cpu_nodes_parsed);
- acpi_numa = 1;
- printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
- pxm, apic_id, node);
-}
-
-#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
-static inline int save_add_info(void) {return 1;}
-#else
-static inline int save_add_info(void) {return 0;}
-#endif
-/*
- * Update nodes_add[]
- * This code supports one contiguous hot add area per node
- */
-static void __init
-update_nodes_add(int node, unsigned long start, unsigned long end)
-{
- unsigned long s_pfn = start >> PAGE_SHIFT;
- unsigned long e_pfn = end >> PAGE_SHIFT;
- int changed = 0;
- struct bootnode *nd = &nodes_add[node];
-
- /* I had some trouble with strange memory hotadd regions breaking
- the boot. Be very strict here and reject anything unexpected.
- If you want working memory hotadd write correct SRATs.
-
- The node size check is a basic sanity check to guard against
- mistakes */
- if ((signed long)(end - start) < NODE_MIN_SIZE) {
- printk(KERN_ERR "SRAT: Hotplug area too small\n");
- return;
- }
-
- /* This check might be a bit too strict, but I'm keeping it for now. */
- if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
- printk(KERN_ERR
- "SRAT: Hotplug area %lu -> %lu has existing memory\n",
- s_pfn, e_pfn);
- return;
- }
-
- /* Looks good */
-
- if (nd->start == nd->end) {
- nd->start = start;
- nd->end = end;
- changed = 1;
- } else {
- if (nd->start == end) {
- nd->start = start;
- changed = 1;
- }
- if (nd->end == start) {
- nd->end = end;
- changed = 1;
- }
- if (!changed)
- printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
- }
-
- if (changed) {
- node_set(node, cpu_nodes_parsed);
- printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
- nd->start, nd->end);
- }
-}
-
-/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
-void __init
-acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
-{
- struct bootnode *nd, oldnode;
- unsigned long start, end;
- int node, pxm;
- int i;
-
- if (srat_disabled())
- return;
- if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
- bad_srat();
- return;
- }
- if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
- return;
-
- if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
- return;
- start = ma->base_address;
- end = start + ma->length;
- pxm = ma->proximity_domain;
- node = setup_node(pxm);
- if (node < 0) {
- printk(KERN_ERR "SRAT: Too many proximity domains.\n");
- bad_srat();
- return;
- }
- i = conflicting_memblks(start, end);
- if (i == node) {
- printk(KERN_WARNING
- "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
- pxm, start, end, nodes[i].start, nodes[i].end);
- } else if (i >= 0) {
- printk(KERN_ERR
- "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
- pxm, start, end, node_to_pxm(i),
- nodes[i].start, nodes[i].end);
- bad_srat();
- return;
- }
- nd = &nodes[node];
- oldnode = *nd;
- if (!node_test_and_set(node, nodes_parsed)) {
- nd->start = start;
- nd->end = end;
- } else {
- if (start < nd->start)
- nd->start = start;
- if (nd->end < end)
- nd->end = end;
- }
-
- printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
- start, end);
-
- if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
- update_nodes_add(node, start, end);
- /* restore nodes[node] */
- *nd = oldnode;
- if ((nd->start | nd->end) == 0)
- node_clear(node, nodes_parsed);
- }
-
- node_memblk_range[num_node_memblks].start = start;
- node_memblk_range[num_node_memblks].end = end;
- memblk_nodeid[num_node_memblks] = node;
- num_node_memblks++;
-}
-
-/* Sanity check to catch more bad SRATs (they are amazingly common).
- Make sure the PXMs cover all memory. */
-static int __init nodes_cover_memory(const struct bootnode *nodes)
-{
- int i;
- unsigned long pxmram, e820ram;
-
- pxmram = 0;
- for_each_node_mask(i, nodes_parsed) {
- unsigned long s = nodes[i].start >> PAGE_SHIFT;
- unsigned long e = nodes[i].end >> PAGE_SHIFT;
- pxmram += e - s;
- pxmram -= __absent_pages_in_range(i, s, e);
- if ((long)pxmram < 0)
- pxmram = 0;
- }
-
- e820ram = max_pfn - (memblock_x86_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT);
- /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
- if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) {
- printk(KERN_ERR
- "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
- (pxmram << PAGE_SHIFT) >> 20,
- (e820ram << PAGE_SHIFT) >> 20);
- return 0;
- }
- return 1;
-}
-
-void __init acpi_numa_arch_fixup(void) {}
-
-int __init acpi_get_nodes(struct bootnode *physnodes)
-{
- int i;
- int ret = 0;
-
- for_each_node_mask(i, nodes_parsed) {
- physnodes[ret].start = nodes[i].start;
- physnodes[ret].end = nodes[i].end;
- ret++;
- }
- return ret;
-}
-
-/* Use the information discovered above to actually set up the nodes. */
-int __init acpi_scan_nodes(unsigned long start, unsigned long end)
-{
- int i;
-
- if (acpi_numa <= 0)
- return -1;
-
- /* First clean up the node list */
- for (i = 0; i < MAX_NUMNODES; i++)
- cutoff_node(i, start, end);
-
- /*
- * Join together blocks on the same node, holes between
- * which don't overlap with memory on other nodes.
- */
- for (i = 0; i < num_node_memblks; ++i) {
- int j, k;
-
- for (j = i + 1; j < num_node_memblks; ++j) {
- unsigned long start, end;
-
- if (memblk_nodeid[i] != memblk_nodeid[j])
- continue;
- start = min(node_memblk_range[i].end,
- node_memblk_range[j].end);
- end = max(node_memblk_range[i].start,
- node_memblk_range[j].start);
- for (k = 0; k < num_node_memblks; ++k) {
- if (memblk_nodeid[i] == memblk_nodeid[k])
- continue;
- if (start < node_memblk_range[k].end &&
- end > node_memblk_range[k].start)
- break;
- }
- if (k < num_node_memblks)
- continue;
- start = min(node_memblk_range[i].start,
- node_memblk_range[j].start);
- end = max(node_memblk_range[i].end,
- node_memblk_range[j].end);
- printk(KERN_INFO "SRAT: Node %d "
- "[%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
- memblk_nodeid[i],
- node_memblk_range[i].start,
- node_memblk_range[i].end,
- node_memblk_range[j].start,
- node_memblk_range[j].end,
- start, end);
- node_memblk_range[i].start = start;
- node_memblk_range[i].end = end;
- k = --num_node_memblks - j;
- memmove(memblk_nodeid + j, memblk_nodeid + j+1,
- k * sizeof(*memblk_nodeid));
- memmove(node_memblk_range + j, node_memblk_range + j+1,
- k * sizeof(*node_memblk_range));
- --j;
- }
- }
-
- memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
- memblk_nodeid);
- if (memnode_shift < 0) {
- printk(KERN_ERR
- "SRAT: No NUMA node hash function found. Contact maintainer\n");
- bad_srat();
- return -1;
- }
-
- for (i = 0; i < num_node_memblks; i++)
- memblock_x86_register_active_regions(memblk_nodeid[i],
- node_memblk_range[i].start >> PAGE_SHIFT,
- node_memblk_range[i].end >> PAGE_SHIFT);
-
- /* for out of order entries in SRAT */
- sort_node_map();
- if (!nodes_cover_memory(nodes)) {
- bad_srat();
- return -1;
- }
-
- /* Account for nodes with cpus and no memory */
- nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed);
-
- /* Finally register nodes */
- for_each_node_mask(i, node_possible_map)
- setup_node_bootmem(i, nodes[i].start, nodes[i].end);
- /* Try again in case setup_node_bootmem missed one due
- to missing bootmem */
- for_each_node_mask(i, node_possible_map)
- if (!node_online(i))
- setup_node_bootmem(i, nodes[i].start, nodes[i].end);
-
- for (i = 0; i < nr_cpu_ids; i++) {
- int node = early_cpu_to_node(i);
-
- if (node == NUMA_NO_NODE)
- continue;
- if (!node_online(node))
- numa_clear_node(i);
- }
- numa_init_array();
- return 0;
-}
-
-#ifdef CONFIG_NUMA_EMU
-static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
- [0 ... MAX_NUMNODES-1] = PXM_INVAL
-};
-static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
- [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
-};
-static int __init find_node_by_addr(unsigned long addr)
-{
- int ret = NUMA_NO_NODE;
- int i;
-
- for_each_node_mask(i, nodes_parsed) {
- /*
- * Find the real node that this emulated node appears on. For
- * the sake of simplicity, we only use a real node's starting
- * address to determine which emulated node it appears on.
- */
- if (addr >= nodes[i].start && addr < nodes[i].end) {
- ret = i;
- break;
- }
- }
- return ret;
-}
-
-/*
- * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
- * mappings that respect the real ACPI topology but reflect our emulated
- * environment. For each emulated node, we find which real node it appears on
- * and create PXM to NID mappings for those fake nodes which mirror that
- * locality. SLIT will now represent the correct distances between emulated
- * nodes as a result of the real topology.
- */
-void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
-{
- int i, j;
-
- printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
- "topology.\n");
- for (i = 0; i < num_nodes; i++) {
- int nid, pxm;
-
- nid = find_node_by_addr(fake_nodes[i].start);
- if (nid == NUMA_NO_NODE)
- continue;
- pxm = node_to_pxm(nid);
- if (pxm == PXM_INVAL)
- continue;
- fake_node_to_pxm_map[i] = pxm;
- /*
- * For each apicid_to_node mapping that exists for this real
- * node, it must now point to the fake node ID.
- */
- for (j = 0; j < MAX_LOCAL_APIC; j++)
- if (apicid_to_node[j] == nid &&
- fake_apicid_to_node[j] == NUMA_NO_NODE)
- fake_apicid_to_node[j] = i;
- }
- for (i = 0; i < num_nodes; i++)
- __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
- memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
-
- nodes_clear(nodes_parsed);
- for (i = 0; i < num_nodes; i++)
- if (fake_nodes[i].start != fake_nodes[i].end)
- node_set(i, nodes_parsed);
-}
-
-static int null_slit_node_compare(int a, int b)
-{
- return node_to_pxm(a) == node_to_pxm(b);
-}
-#else
-static int null_slit_node_compare(int a, int b)
-{
- return a == b;
-}
-#endif /* CONFIG_NUMA_EMU */
-
-int __node_distance(int a, int b)
-{
- int index;
-
- if (!acpi_slit)
- return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
- REMOTE_DISTANCE;
- index = acpi_slit->locality_count * node_to_pxm(a);
- return acpi_slit->entry[index + node_to_pxm(b)];
-}
-
-EXPORT_SYMBOL(__node_distance);
-
-#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
-int memory_add_physaddr_to_nid(u64 start)
-{
- int i, ret = 0;
-
- for_each_node(i)
- if (nodes_add[i].start <= start && nodes_add[i].end > start)
- ret = i;
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
-#endif
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 12cdbb17ad1..dd8dda167a2 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -12,6 +12,7 @@
#include <asm/cache.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>
+#include <linux/debugfs.h>
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
= { &init_mm, 0, };
@@ -27,33 +28,14 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
*
* More scalable flush, from Andi Kleen
*
- * To avoid global state use 8 different call vectors.
- * Each CPU uses a specific vector to trigger flushes on other
- * CPUs. Depending on the received vector the target CPUs look into
- * the right array slot for the flush data.
- *
- * With more than 8 CPUs they are hashed to the 8 available
- * vectors. The limited global vector space forces us to this right now.
- * In future when interrupts are split into per CPU domains this could be
- * fixed, at the cost of triggering multiple IPIs in some cases.
+ * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
*/
-union smp_flush_state {
- struct {
- struct mm_struct *flush_mm;
- unsigned long flush_va;
- raw_spinlock_t tlbstate_lock;
- DECLARE_BITMAP(flush_cpumask, NR_CPUS);
- };
- char pad[INTERNODE_CACHE_BYTES];
-} ____cacheline_internodealigned_in_smp;
-
-/* State is put into the per CPU data section, but padded
- to a full cache line because other CPUs can access it and we don't
- want false sharing in the per cpu data segment. */
-static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
-
-static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset);
+struct flush_tlb_info {
+ struct mm_struct *flush_mm;
+ unsigned long flush_start;
+ unsigned long flush_end;
+};
/*
* We cannot call mmdrop() because we are in interrupt context,
@@ -61,37 +43,36 @@ static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset);
*/
void leave_mm(int cpu)
{
- if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
+ struct mm_struct *active_mm = this_cpu_read(cpu_tlbstate.active_mm);
+ if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
BUG();
- cpumask_clear_cpu(cpu,
- mm_cpumask(percpu_read(cpu_tlbstate.active_mm)));
- load_cr3(swapper_pg_dir);
+ if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
+ cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
+ load_cr3(swapper_pg_dir);
+ }
}
EXPORT_SYMBOL_GPL(leave_mm);
/*
- *
* The flush IPI assumes that a thread switch happens in this order:
* [cpu0: the cpu that switches]
* 1) switch_mm() either 1a) or 1b)
* 1a) thread switch to a different mm
- * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
- * Stop ipi delivery for the old mm. This is not synchronized with
- * the other cpus, but smp_invalidate_interrupt ignore flush ipis
- * for the wrong mm, and in the worst case we perform a superfluous
- * tlb flush.
- * 1a2) set cpu mmu_state to TLBSTATE_OK
- * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
- * was in lazy tlb mode.
- * 1a3) update cpu active_mm
+ * 1a1) set cpu_tlbstate to TLBSTATE_OK
+ * Now the tlb flush NMI handler flush_tlb_func won't call leave_mm
+ * if cpu0 was in lazy tlb mode.
+ * 1a2) update cpu active_mm
* Now cpu0 accepts tlb flushes for the new mm.
- * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
+ * 1a3) cpu_set(cpu, new_mm->cpu_vm_mask);
* Now the other cpus will send tlb flush ipis.
* 1a4) change cr3.
+ * 1a5) cpu_clear(cpu, old_mm->cpu_vm_mask);
+ * Stop ipi delivery for the old mm. This is not synchronized with
+ * the other cpus, but flush_tlb_func ignore flush ipis for the wrong
+ * mm, and in the worst case we perform a superfluous tlb flush.
* 1b) thread switch without mm change
- * cpu active_mm is correct, cpu0 already handles
- * flush ipis.
- * 1b1) set cpu mmu_state to TLBSTATE_OK
+ * cpu active_mm is correct, cpu0 already handles flush ipis.
+ * 1b1) set cpu_tlbstate to TLBSTATE_OK
* 1b2) test_and_set the cpu bit in cpu_vm_mask.
* Atomically set the bit [other cpus will start sending flush ipis],
* and test the bit.
@@ -104,206 +85,137 @@ EXPORT_SYMBOL_GPL(leave_mm);
* runs in kernel space, the cpu could load tlb entries for user space
* pages.
*
- * The good news is that cpu mmu_state is local to each cpu, no
+ * The good news is that cpu_tlbstate is local to each cpu, no
* write/read ordering problems.
*/
/*
- * TLB flush IPI:
- *
+ * TLB flush funcation:
* 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
* 2) Leave the mm if we are in the lazy tlb mode.
- *
- * Interrupts are disabled.
- */
-
-/*
- * FIXME: use of asmlinkage is not consistent. On x86_64 it's noop
- * but still used for documentation purpose but the usage is slightly
- * inconsistent. On x86_32, asmlinkage is regparm(0) but interrupt
- * entry calls in with the first parameter in %eax. Maybe define
- * intrlinkage?
*/
-#ifdef CONFIG_X86_64
-asmlinkage
-#endif
-void smp_invalidate_interrupt(struct pt_regs *regs)
+static void flush_tlb_func(void *info)
{
- unsigned int cpu;
- unsigned int sender;
- union smp_flush_state *f;
-
- cpu = smp_processor_id();
- /*
- * orig_rax contains the negated interrupt vector.
- * Use that to determine where the sender put the data.
- */
- sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
- f = &flush_state[sender];
-
- if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask)))
- goto out;
- /*
- * This was a BUG() but until someone can quote me the
- * line from the intel manual that guarantees an IPI to
- * multiple CPUs is retried _only_ on the erroring CPUs
- * its staying as a return
- *
- * BUG();
- */
-
- if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
- if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
- if (f->flush_va == TLB_FLUSH_ALL)
- local_flush_tlb();
- else
- __flush_tlb_one(f->flush_va);
- } else
- leave_mm(cpu);
- }
-out:
- ack_APIC_irq();
- smp_mb__before_clear_bit();
- cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask));
- smp_mb__after_clear_bit();
+ struct flush_tlb_info *f = info;
+
inc_irq_stat(irq_tlb_count);
-}
-static void flush_tlb_others_ipi(const struct cpumask *cpumask,
- struct mm_struct *mm, unsigned long va)
-{
- unsigned int sender;
- union smp_flush_state *f;
-
- /* Caller has disabled preemption */
- sender = this_cpu_read(tlb_vector_offset);
- f = &flush_state[sender];
-
- /*
- * Could avoid this lock when
- * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
- * probably not worth checking this for a cache-hot lock.
- */
- raw_spin_lock(&f->tlbstate_lock);
-
- f->flush_mm = mm;
- f->flush_va = va;
- if (cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id()))) {
- /*
- * We have to send the IPI only to
- * CPUs affected.
- */
- apic->send_IPI_mask(to_cpumask(f->flush_cpumask),
- INVALIDATE_TLB_VECTOR_START + sender);
-
- while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
- cpu_relax();
- }
+ if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
+ return;
+
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+ if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
+ if (f->flush_end == TLB_FLUSH_ALL)
+ local_flush_tlb();
+ else if (!f->flush_end)
+ __flush_tlb_single(f->flush_start);
+ else {
+ unsigned long addr;
+ addr = f->flush_start;
+ while (addr < f->flush_end) {
+ __flush_tlb_single(addr);
+ addr += PAGE_SIZE;
+ }
+ }
+ } else
+ leave_mm(smp_processor_id());
- f->flush_mm = NULL;
- f->flush_va = 0;
- raw_spin_unlock(&f->tlbstate_lock);
}
void native_flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm, unsigned long va)
+ struct mm_struct *mm, unsigned long start,
+ unsigned long end)
{
+ struct flush_tlb_info info;
+ info.flush_mm = mm;
+ info.flush_start = start;
+ info.flush_end = end;
+
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
if (is_uv_system()) {
unsigned int cpu;
- cpu = get_cpu();
- cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu);
+ cpu = smp_processor_id();
+ cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu);
if (cpumask)
- flush_tlb_others_ipi(cpumask, mm, va);
- put_cpu();
+ smp_call_function_many(cpumask, flush_tlb_func,
+ &info, 1);
return;
}
- flush_tlb_others_ipi(cpumask, mm, va);
+ smp_call_function_many(cpumask, flush_tlb_func, &info, 1);
}
-static void __cpuinit calculate_tlb_offset(void)
-{
- int cpu, node, nr_node_vecs;
- /*
- * we are changing tlb_vector_offset for each CPU in runtime, but this
- * will not cause inconsistency, as the write is atomic under X86. we
- * might see more lock contentions in a short time, but after all CPU's
- * tlb_vector_offset are changed, everything should go normal
- *
- * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might
- * waste some vectors.
- **/
- if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS)
- nr_node_vecs = 1;
- else
- nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes;
-
- for_each_online_node(node) {
- int node_offset = (node % NUM_INVALIDATE_TLB_VECTORS) *
- nr_node_vecs;
- int cpu_offset = 0;
- for_each_cpu(cpu, cpumask_of_node(node)) {
- per_cpu(tlb_vector_offset, cpu) = node_offset +
- cpu_offset;
- cpu_offset++;
- cpu_offset = cpu_offset % nr_node_vecs;
- }
- }
-}
-
-static int __cpuinit tlb_cpuhp_notify(struct notifier_block *n,
- unsigned long action, void *hcpu)
-{
- switch (action & 0xf) {
- case CPU_ONLINE:
- case CPU_DEAD:
- calculate_tlb_offset();
- }
- return NOTIFY_OK;
-}
-
-static int __cpuinit init_smp_flush(void)
-{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(flush_state); i++)
- raw_spin_lock_init(&flush_state[i].tlbstate_lock);
-
- calculate_tlb_offset();
- hotcpu_notifier(tlb_cpuhp_notify, 0);
- return 0;
-}
-core_initcall(init_smp_flush);
-
void flush_tlb_current_task(void)
{
struct mm_struct *mm = current->mm;
preempt_disable();
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
local_flush_tlb();
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
- flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL);
+ flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
preempt_enable();
}
-void flush_tlb_mm(struct mm_struct *mm)
+void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, unsigned long vmflag)
{
+ unsigned long addr;
+ unsigned act_entries, tlb_entries = 0;
+ unsigned long nr_base_pages;
+
preempt_disable();
+ if (current->active_mm != mm)
+ goto flush_all;
- if (current->active_mm == mm) {
- if (current->mm)
- local_flush_tlb();
- else
- leave_mm(smp_processor_id());
+ if (!current->mm) {
+ leave_mm(smp_processor_id());
+ goto flush_all;
}
- if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
- flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL);
+ if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1
+ || vmflag & VM_HUGETLB) {
+ local_flush_tlb();
+ goto flush_all;
+ }
+
+ /* In modern CPU, last level tlb used for both data/ins */
+ if (vmflag & VM_EXEC)
+ tlb_entries = tlb_lli_4k[ENTRIES];
+ else
+ tlb_entries = tlb_lld_4k[ENTRIES];
+
+ /* Assume all of TLB entries was occupied by this task */
+ act_entries = tlb_entries >> tlb_flushall_shift;
+ act_entries = mm->total_vm > act_entries ? act_entries : mm->total_vm;
+ nr_base_pages = (end - start) >> PAGE_SHIFT;
+
+ /* tlb_flushall_shift is on balance point, details in commit log */
+ if (nr_base_pages > act_entries) {
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ local_flush_tlb();
+ } else {
+ /* flush range by one by one 'invlpg' */
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
+ __flush_tlb_single(addr);
+ }
+
+ if (cpumask_any_but(mm_cpumask(mm),
+ smp_processor_id()) < nr_cpu_ids)
+ flush_tlb_others(mm_cpumask(mm), mm, start, end);
+ preempt_enable();
+ return;
+ }
+
+flush_all:
+ if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
+ flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
preempt_enable();
}
-void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
{
struct mm_struct *mm = vma->vm_mm;
@@ -311,25 +223,105 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
if (current->active_mm == mm) {
if (current->mm)
- __flush_tlb_one(va);
+ __flush_tlb_one(start);
else
leave_mm(smp_processor_id());
}
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
- flush_tlb_others(mm_cpumask(mm), mm, va);
+ flush_tlb_others(mm_cpumask(mm), mm, start, 0UL);
preempt_enable();
}
static void do_flush_tlb_all(void *info)
{
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
__flush_tlb_all();
- if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
+ if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
leave_mm(smp_processor_id());
}
void flush_tlb_all(void)
{
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
on_each_cpu(do_flush_tlb_all, NULL, 1);
}
+
+static void do_kernel_range_flush(void *info)
+{
+ struct flush_tlb_info *f = info;
+ unsigned long addr;
+
+ /* flush range by one by one 'invlpg' */
+ for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE)
+ __flush_tlb_single(addr);
+}
+
+void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+ unsigned act_entries;
+ struct flush_tlb_info info;
+
+ /* In modern CPU, last level tlb used for both data/ins */
+ act_entries = tlb_lld_4k[ENTRIES];
+
+ /* Balance as user space task's flush, a bit conservative */
+ if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 ||
+ (end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift)
+
+ on_each_cpu(do_flush_tlb_all, NULL, 1);
+ else {
+ info.flush_start = start;
+ info.flush_end = end;
+ on_each_cpu(do_kernel_range_flush, &info, 1);
+ }
+}
+
+#ifdef CONFIG_DEBUG_TLBFLUSH
+static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ char buf[32];
+ unsigned int len;
+
+ len = sprintf(buf, "%hd\n", tlb_flushall_shift);
+ return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+static ssize_t tlbflush_write_file(struct file *file,
+ const char __user *user_buf, size_t count, loff_t *ppos)
+{
+ char buf[32];
+ ssize_t len;
+ s8 shift;
+
+ len = min(count, sizeof(buf) - 1);
+ if (copy_from_user(buf, user_buf, len))
+ return -EFAULT;
+
+ buf[len] = '\0';
+ if (kstrtos8(buf, 0, &shift))
+ return -EINVAL;
+
+ if (shift < -1 || shift >= BITS_PER_LONG)
+ return -EINVAL;
+
+ tlb_flushall_shift = shift;
+ return count;
+}
+
+static const struct file_operations fops_tlbflush = {
+ .read = tlbflush_read_file,
+ .write = tlbflush_write_file,
+ .llseek = default_llseek,
+};
+
+static int __init create_tlb_flushall_shift(void)
+{
+ debugfs_create_file("tlb_flushall_shift", S_IRUSR | S_IWUSR,
+ arch_debugfs_dir, NULL, &fops_tlbflush);
+ return 0;
+}
+late_initcall(create_tlb_flushall_shift);
+#endif