aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--arch/x86/mm/Makefile6
-rw-r--r--arch/x86/mm/dump_pagetables.c4
-rw-r--r--arch/x86/mm/fault.c67
-rw-r--r--arch/x86/mm/gup.c10
-rw-r--r--arch/x86/mm/highmem_32.c1
-rw-r--r--arch/x86/mm/init_32.c94
-rw-r--r--arch/x86/mm/init_64.c184
-rw-r--r--arch/x86/mm/ioremap.c197
-rw-r--r--arch/x86/mm/numa_32.c (renamed from arch/x86/mm/discontig_32.c)2
-rw-r--r--arch/x86/mm/numa_64.c10
-rw-r--r--arch/x86/mm/pageattr-test.c9
-rw-r--r--arch/x86/mm/pageattr.c463
-rw-r--r--arch/x86/mm/pat.c132
-rw-r--r--arch/x86/mm/pgtable.c6
-rw-r--r--arch/x86/mm/pgtable_32.c3
-rw-r--r--arch/x86/mm/srat_64.c2
16 files changed, 797 insertions, 393 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index dfb932dcf13..59f89b434b4 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -13,12 +13,8 @@ obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
mmiotrace-y := pf_in.o mmio-mod.o
obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
-ifeq ($(CONFIG_X86_32),y)
-obj-$(CONFIG_NUMA) += discontig_32.o
-else
-obj-$(CONFIG_NUMA) += numa_64.o
+obj-$(CONFIG_NUMA) += numa_$(BITS).o
obj-$(CONFIG_K8_NUMA) += k8topology_64.o
-endif
obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
obj-$(CONFIG_MEMTEST) += memtest.o
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index a20d1fa64b4..e7277cbcfb4 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -148,8 +148,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
* we have now. "break" is either changing perms, levels or
* address space marker.
*/
- prot = pgprot_val(new_prot) & ~(PTE_PFN_MASK);
- cur = pgprot_val(st->current_prot) & ~(PTE_PFN_MASK);
+ prot = pgprot_val(new_prot) & PTE_FLAGS_MASK;
+ cur = pgprot_val(st->current_prot) & PTE_FLAGS_MASK;
if (!st->level) {
/* First entry */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 455f3fe67b4..31e8730fa24 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -35,6 +35,7 @@
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>
+#include <asm/traps.h>
/*
* Page fault error code bits
@@ -357,8 +358,6 @@ static int is_errata100(struct pt_regs *regs, unsigned long address)
return 0;
}
-void do_invalid_op(struct pt_regs *, unsigned long);
-
static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
@@ -593,11 +592,6 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
unsigned long flags;
#endif
- /*
- * We can fault from pretty much anywhere, with unknown IRQ state.
- */
- trace_hardirqs_fixup();
-
tsk = current;
mm = tsk->mm;
prefetchw(&mm->mmap_sem);
@@ -646,24 +640,23 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
}
-#ifdef CONFIG_X86_32
- /* It's safe to allow irq's after cr2 has been saved and the vmalloc
- fault has been handled. */
- if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK))
- local_irq_enable();
-
/*
- * If we're in an interrupt, have no user context or are running in an
- * atomic region then we must not take the fault.
+ * It's safe to allow irq's after cr2 has been saved and the
+ * vmalloc fault has been handled.
+ *
+ * User-mode registers count as a user access even for any
+ * potential system fault or CPU buglet.
*/
- if (in_atomic() || !mm)
- goto bad_area_nosemaphore;
-#else /* CONFIG_X86_64 */
- if (likely(regs->flags & X86_EFLAGS_IF))
+ if (user_mode_vm(regs)) {
+ local_irq_enable();
+ error_code |= PF_USER;
+ } else if (regs->flags & X86_EFLAGS_IF)
local_irq_enable();
+#ifdef CONFIG_X86_64
if (unlikely(error_code & PF_RSVD))
pgtable_bad(address, regs, error_code);
+#endif
/*
* If we're in an interrupt, have no user context or are running in an
@@ -672,15 +665,9 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
if (unlikely(in_atomic() || !mm))
goto bad_area_nosemaphore;
- /*
- * User-mode registers count as a user access even for any
- * potential system fault or CPU buglet.
- */
- if (user_mode_vm(regs))
- error_code |= PF_USER;
again:
-#endif
- /* When running in the kernel we expect faults to occur only to
+ /*
+ * When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in the
* kernel and should generate an OOPS. Unfortunately, in the case of an
* erroneous fault occurring in a code path which already holds mmap_sem
@@ -743,9 +730,6 @@ good_area:
goto bad_area;
}
-#ifdef CONFIG_X86_32
-survive:
-#endif
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
@@ -880,12 +864,11 @@ out_of_memory:
up_read(&mm->mmap_sem);
if (is_global_init(tsk)) {
yield();
-#ifdef CONFIG_X86_32
- down_read(&mm->mmap_sem);
- goto survive;
-#else
+ /*
+ * Re-lookup the vma - in theory the vma tree might
+ * have changed:
+ */
goto again;
-#endif
}
printk("VM: killing process %s\n", tsk->comm);
@@ -915,15 +898,15 @@ LIST_HEAD(pgd_list);
void vmalloc_sync_all(void)
{
-#ifdef CONFIG_X86_32
- unsigned long start = VMALLOC_START & PGDIR_MASK;
unsigned long address;
+#ifdef CONFIG_X86_32
if (SHARED_KERNEL_PMD)
return;
- BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
- for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
+ for (address = VMALLOC_START & PMD_MASK;
+ address >= TASK_SIZE && address < FIXADDR_TOP;
+ address += PMD_SIZE) {
unsigned long flags;
struct page *page;
@@ -936,10 +919,8 @@ void vmalloc_sync_all(void)
spin_unlock_irqrestore(&pgd_lock, flags);
}
#else /* CONFIG_X86_64 */
- unsigned long start = VMALLOC_START & PGDIR_MASK;
- unsigned long address;
-
- for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
+ for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
+ address += PGDIR_SIZE) {
const pgd_t *pgd_ref = pgd_offset_k(address);
unsigned long flags;
struct page *page;
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 007bb06c750..4ba373c5b8c 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -82,7 +82,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
pte_t pte = gup_get_pte(ptep);
struct page *page;
- if ((pte_val(pte) & (mask | _PAGE_SPECIAL)) != mask) {
+ if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
pte_unmap(ptep);
return 0;
}
@@ -116,10 +116,10 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
mask = _PAGE_PRESENT|_PAGE_USER;
if (write)
mask |= _PAGE_RW;
- if ((pte_val(pte) & mask) != mask)
+ if ((pte_flags(pte) & mask) != mask)
return 0;
/* hugepages are never "special" */
- VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
+ VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
refs = 0;
@@ -173,10 +173,10 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
mask = _PAGE_PRESENT|_PAGE_USER;
if (write)
mask |= _PAGE_RW;
- if ((pte_val(pte) & mask) != mask)
+ if ((pte_flags(pte) & mask) != mask)
return 0;
/* hugepages are never "special" */
- VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
+ VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
refs = 0;
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 165c871ba9a..bcc079c282d 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -137,6 +137,7 @@ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
return (void*) vaddr;
}
+EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */
struct page *kmap_atomic_to_page(void *ptr)
{
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index d37f29376b0..8396868e82c 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -31,6 +31,7 @@
#include <linux/cpumask.h>
#include <asm/asm.h>
+#include <asm/bios_ebda.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
@@ -47,6 +48,7 @@
#include <asm/paravirt.h>
#include <asm/setup.h>
#include <asm/cacheflush.h>
+#include <asm/smp.h>
unsigned int __VMALLOC_RESERVE = 128 << 20;
@@ -194,11 +196,30 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
pgd_t *pgd;
pmd_t *pmd;
pte_t *pte;
- unsigned pages_2m = 0, pages_4k = 0;
+ unsigned pages_2m, pages_4k;
+ int mapping_iter;
+
+ /*
+ * First iteration will setup identity mapping using large/small pages
+ * based on use_pse, with other attributes same as set by
+ * the early code in head_32.S
+ *
+ * Second iteration will setup the appropriate attributes (NX, GLOBAL..)
+ * as desired for the kernel identity mapping.
+ *
+ * This two pass mechanism conforms to the TLB app note which says:
+ *
+ * "Software should not write to a paging-structure entry in a way
+ * that would change, for any linear address, both the page size
+ * and either the page frame or attributes."
+ */
+ mapping_iter = 1;
if (!cpu_has_pse)
use_pse = 0;
+repeat:
+ pages_2m = pages_4k = 0;
pfn = start_pfn;
pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
pgd = pgd_base + pgd_idx;
@@ -224,6 +245,13 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
if (use_pse) {
unsigned int addr2;
pgprot_t prot = PAGE_KERNEL_LARGE;
+ /*
+ * first pass will use the same initial
+ * identity mapping attribute + _PAGE_PSE.
+ */
+ pgprot_t init_prot =
+ __pgprot(PTE_IDENT_ATTR |
+ _PAGE_PSE);
addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
PAGE_OFFSET + PAGE_SIZE-1;
@@ -233,7 +261,10 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
prot = PAGE_KERNEL_LARGE_EXEC;
pages_2m++;
- set_pmd(pmd, pfn_pmd(pfn, prot));
+ if (mapping_iter == 1)
+ set_pmd(pmd, pfn_pmd(pfn, init_prot));
+ else
+ set_pmd(pmd, pfn_pmd(pfn, prot));
pfn += PTRS_PER_PTE;
continue;
@@ -245,17 +276,43 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
pgprot_t prot = PAGE_KERNEL;
+ /*
+ * first pass will use the same initial
+ * identity mapping attribute.
+ */
+ pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR);
if (is_kernel_text(addr))
prot = PAGE_KERNEL_EXEC;
pages_4k++;
- set_pte(pte, pfn_pte(pfn, prot));
+ if (mapping_iter == 1)
+ set_pte(pte, pfn_pte(pfn, init_prot));
+ else
+ set_pte(pte, pfn_pte(pfn, prot));
}
}
}
- update_page_count(PG_LEVEL_2M, pages_2m);
- update_page_count(PG_LEVEL_4K, pages_4k);
+ if (mapping_iter == 1) {
+ /*
+ * update direct mapping page count only in the first
+ * iteration.
+ */
+ update_page_count(PG_LEVEL_2M, pages_2m);
+ update_page_count(PG_LEVEL_4K, pages_4k);
+
+ /*
+ * local global flush tlb, which will flush the previous
+ * mappings present in both small and large page TLB's.
+ */
+ __flush_tlb_all();
+
+ /*
+ * Second iteration will set the actual desired PTE attributes.
+ */
+ mapping_iter = 2;
+ goto repeat;
+ }
}
/*
@@ -458,11 +515,7 @@ static void __init pagetable_init(void)
{
pgd_t *pgd_base = swapper_pg_dir;
- paravirt_pagetable_setup_start(pgd_base);
-
permanent_kmaps_init(pgd_base);
-
- paravirt_pagetable_setup_done(pgd_base);
}
#ifdef CONFIG_ACPI_SLEEP
@@ -505,7 +558,7 @@ void zap_low_mappings(void)
int nx_enabled;
-pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL);
+pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
EXPORT_SYMBOL_GPL(__supported_pte_mask);
#ifdef CONFIG_X86_PAE
@@ -722,7 +775,7 @@ void __init setup_bootmem_allocator(void)
after_init_bootmem = 1;
}
-static void __init find_early_table_space(unsigned long end)
+static void __init find_early_table_space(unsigned long end, int use_pse)
{
unsigned long puds, pmds, ptes, tables, start;
@@ -732,7 +785,7 @@ static void __init find_early_table_space(unsigned long end)
pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
- if (cpu_has_pse) {
+ if (use_pse) {
unsigned long extra;
extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
@@ -772,12 +825,22 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
pgd_t *pgd_base = swapper_pg_dir;
unsigned long start_pfn, end_pfn;
unsigned long big_page_start;
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ /*
+ * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+ * This will simplify cpa(), which otherwise needs to support splitting
+ * large pages into small in interrupt context, etc.
+ */
+ int use_pse = 0;
+#else
+ int use_pse = cpu_has_pse;
+#endif
/*
* Find space for the kernel direct mapping tables.
*/
if (!after_init_bootmem)
- find_early_table_space(end);
+ find_early_table_space(end, use_pse);
#ifdef CONFIG_X86_PAE
set_nx();
@@ -823,7 +886,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
if (start_pfn < end_pfn)
kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
- cpu_has_pse);
+ use_pse);
/* tail is not big page alignment ? */
start_pfn = end_pfn;
@@ -907,6 +970,8 @@ void __init mem_init(void)
int codesize, reservedpages, datasize, initsize;
int tmp;
+ start_periodic_check_for_corruption();
+
#ifdef CONFIG_FLATMEM
BUG_ON(!mem_map);
#endif
@@ -986,7 +1051,6 @@ void __init mem_init(void)
if (boot_cpu_data.wp_works_ok < 0)
test_wp_bit();
- cpa_init();
save_pg_dir();
zap_low_mappings();
}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index d3746efb060..b8e461d4941 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -31,6 +31,7 @@
#include <linux/nmi.h>
#include <asm/processor.h>
+#include <asm/bios_ebda.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -88,6 +89,62 @@ early_param("gbpages", parse_direct_gbpages_on);
int after_bootmem;
+pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
+EXPORT_SYMBOL_GPL(__supported_pte_mask);
+
+static int do_not_nx __cpuinitdata;
+
+/*
+ * noexec=on|off
+ * Control non-executable mappings for 64-bit processes.
+ *
+ * on Enable (default)
+ * off Disable
+ */
+static int __init nonx_setup(char *str)
+{
+ if (!str)
+ return -EINVAL;
+ if (!strncmp(str, "on", 2)) {
+ __supported_pte_mask |= _PAGE_NX;
+ do_not_nx = 0;
+ } else if (!strncmp(str, "off", 3)) {
+ do_not_nx = 1;
+ __supported_pte_mask &= ~_PAGE_NX;
+ }
+ return 0;
+}
+early_param("noexec", nonx_setup);
+
+void __cpuinit check_efer(void)
+{
+ unsigned long efer;
+
+ rdmsrl(MSR_EFER, efer);
+ if (!(efer & EFER_NX) || do_not_nx)
+ __supported_pte_mask &= ~_PAGE_NX;
+}
+
+int force_personality32;
+
+/*
+ * noexec32=on|off
+ * Control non executable heap for 32bit processes.
+ * To control the stack too use noexec=off
+ *
+ * on PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
+ * off PROT_READ implies PROT_EXEC
+ */
+static int __init nonx32_setup(char *str)
+{
+ if (!strcmp(str, "on"))
+ force_personality32 &= ~READ_IMPLIES_EXEC;
+ else if (!strcmp(str, "off"))
+ force_personality32 |= READ_IMPLIES_EXEC;
+ return 1;
+}
+__setup("noexec32=", nonx32_setup);
+
/*
* NOTE: This function is marked __ref because it calls __init function
* (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
@@ -139,9 +196,6 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
}
pte = pte_offset_kernel(pmd, vaddr);
- if (!pte_none(*pte) && pte_val(new_pte) &&
- pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
- pte_ERROR(*pte);
set_pte(pte, new_pte);
/*
@@ -225,7 +279,7 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
void __init cleanup_highmap(void)
{
unsigned long vaddr = __START_KERNEL_map;
- unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1;
+ unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1;
pmd_t *pmd = level2_kernel_pgt;
pmd_t *last_pmd = pmd + PTRS_PER_PMD;
@@ -256,7 +310,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
if (pfn >= table_top)
panic("alloc_low_page: ran out of memory");
- adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
+ adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
memset(adr, 0, PAGE_SIZE);
*phys = pfn * PAGE_SIZE;
return adr;
@@ -271,7 +325,8 @@ static __ref void unmap_low_page(void *adr)
}
static unsigned long __meminit
-phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
+phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
+ pgprot_t prot)
{
unsigned pages = 0;
unsigned long last_map_addr = end;
@@ -289,36 +344,43 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
break;
}
+ /*
+ * We will re-use the existing mapping.
+ * Xen for example has some special requirements, like mapping
+ * pagetable pages as RO. So assume someone who pre-setup
+ * these mappings are more intelligent.
+ */
if (pte_val(*pte))
continue;
if (0)
printk(" pte=%p addr=%lx pte=%016lx\n",
pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
- set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));
- last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
pages++;
+ set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
+ last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
}
+
update_page_count(PG_LEVEL_4K, pages);
return last_map_addr;
}
static unsigned long __meminit
-phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end)
+phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
+ pgprot_t prot)
{
pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
- return phys_pte_init(pte, address, end);
+ return phys_pte_init(pte, address, end, prot);
}
static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
- unsigned long page_size_mask)
+ unsigned long page_size_mask, pgprot_t prot)
{
unsigned long pages = 0;
unsigned long last_map_addr = end;
- unsigned long start = address;
int i = pmd_index(address);
@@ -326,6 +388,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
unsigned long pte_phys;
pmd_t *pmd = pmd_page + pmd_index(address);
pte_t *pte;
+ pgprot_t new_prot = prot;
if (address >= end) {
if (!after_bootmem) {
@@ -339,27 +402,40 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
if (!pmd_large(*pmd)) {
spin_lock(&init_mm.page_table_lock);
last_map_addr = phys_pte_update(pmd, address,
- end);
+ end, prot);
spin_unlock(&init_mm.page_table_lock);
+ continue;
}
- /* Count entries we're using from level2_ident_pgt */
- if (start == 0)
- pages++;
- continue;
+ /*
+ * If we are ok with PG_LEVEL_2M mapping, then we will
+ * use the existing mapping,
+ *
+ * Otherwise, we will split the large page mapping but
+ * use the same existing protection bits except for
+ * large page, so that we don't violate Intel's TLB
+ * Application note (317080) which says, while changing
+ * the page sizes, new and old translations should
+ * not differ with respect to page frame and
+ * attributes.
+ */
+ if (page_size_mask & (1 << PG_LEVEL_2M))
+ continue;
+ new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
}
if (page_size_mask & (1<<PG_LEVEL_2M)) {
pages++;
spin_lock(&init_mm.page_table_lock);
set_pte((pte_t *)pmd,
- pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+ pfn_pte(address >> PAGE_SHIFT,
+ __pgprot(pgprot_val(prot) | _PAGE_PSE)));
spin_unlock(&init_mm.page_table_lock);
last_map_addr = (address & PMD_MASK) + PMD_SIZE;
continue;
}
pte = alloc_low_page(&pte_phys);
- last_map_addr = phys_pte_init(pte, address, end);
+ last_map_addr = phys_pte_init(pte, address, end, new_prot);
unmap_low_page(pte);
spin_lock(&init_mm.page_table_lock);
@@ -372,12 +448,12 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
static unsigned long __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
- unsigned long page_size_mask)
+ unsigned long page_size_mask, pgprot_t prot)
{
pmd_t *pmd = pmd_offset(pud, 0);
unsigned long last_map_addr;
- last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
+ last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
__flush_tlb_all();
return last_map_addr;
}
@@ -394,6 +470,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
unsigned long pmd_phys;
pud_t *pud = pud_page + pud_index(addr);
pmd_t *pmd;
+ pgprot_t prot = PAGE_KERNEL;
if (addr >= end)
break;
@@ -405,10 +482,26 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
}
if (pud_val(*pud)) {
- if (!pud_large(*pud))
+ if (!pud_large(*pud)) {
last_map_addr = phys_pmd_update(pud, addr, end,
- page_size_mask);
- continue;
+ page_size_mask, prot);
+ continue;
+ }
+ /*
+ * If we are ok with PG_LEVEL_1G mapping, then we will
+ * use the existing mapping.
+ *
+ * Otherwise, we will split the gbpage mapping but use
+ * the same existing protection bits except for large
+ * page, so that we don't violate Intel's TLB
+ * Application note (317080) which says, while changing
+ * the page sizes, new and old translations should
+ * not differ with respect to page frame and
+ * attributes.
+ */
+ if (page_size_mask & (1 << PG_LEVEL_1G))
+ continue;
+ prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
}
if (page_size_mask & (1<<PG_LEVEL_1G)) {
@@ -422,7 +515,8 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
}
pmd = alloc_low_page(&pmd_phys);
- last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
+ last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
+ prot);
unmap_low_page(pmd);
spin_lock(&init_mm.page_table_lock);
@@ -430,6 +524,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
spin_unlock(&init_mm.page_table_lock);
}
__flush_tlb_all();
+
update_page_count(PG_LEVEL_1G, pages);
return last_map_addr;
@@ -446,27 +541,28 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
return phys_pud_init(pud, addr, end, page_size_mask);
}
-static void __init find_early_table_space(unsigned long end)
+static void __init find_early_table_space(unsigned long end, int use_pse,
+ int use_gbpages)
{
unsigned long puds, pmds, ptes, tables, start;
puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
- tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
- if (direct_gbpages) {
+ tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
+ if (use_gbpages) {
unsigned long extra;
extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
} else
pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
- tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
+ tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
- if (cpu_has_pse) {
+ if (use_pse) {
unsigned long extra;
extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
} else
ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
- tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE);
+ tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
/*
* RED-PEN putting page tables only on node 0 could
@@ -528,6 +624,7 @@ static unsigned long __init kernel_physical_mapping_init(unsigned long start,
pgd_populate(&init_mm, pgd, __va(pud_phys));
spin_unlock(&init_mm.page_table_lock);
}
+ __flush_tlb_all();
return last_map_addr;
}
@@ -571,6 +668,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
struct map_range mr[NR_RANGE_MR];
int nr_range, i;
+ int use_pse, use_gbpages;
printk(KERN_INFO "init_memory_mapping\n");
@@ -584,9 +682,21 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
if (!after_bootmem)
init_gbpages();
- if (direct_gbpages)
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ /*
+ * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+ * This will simplify cpa(), which otherwise needs to support splitting
+ * large pages into small in interrupt context, etc.
+ */
+ use_pse = use_gbpages = 0;
+#else
+ use_pse = cpu_has_pse;
+ use_gbpages = direct_gbpages;
+#endif
+
+ if (use_gbpages)
page_size_mask |= 1 << PG_LEVEL_1G;
- if (cpu_has_pse)
+ if (use_pse)
page_size_mask |= 1 << PG_LEVEL_2M;
memset(mr, 0, sizeof(mr));
@@ -636,7 +746,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
old_start = mr[i].start;
memmove(&mr[i], &mr[i+1],
(nr_range - 1 - i) * sizeof (struct map_range));
- mr[i].start = old_start;
+ mr[i--].start = old_start;
nr_range--;
}
@@ -647,7 +757,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
(mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
if (!after_bootmem)
- find_early_table_space(end);
+ find_early_table_space(end, use_pse, use_gbpages);
for (i = 0; i < nr_range; i++)
last_map_addr = kernel_physical_mapping_init(
@@ -769,6 +879,8 @@ void __init mem_init(void)
{
long codesize, reservedpages, datasize, initsize;
+ start_periodic_check_for_corruption();
+
pci_iommu_alloc();
/* clear_bss() already clear the empty_zero_page */
@@ -806,8 +918,6 @@ void __init mem_init(void)
reservedpages << (PAGE_SHIFT-10),
datasize >> 10,
initsize >> 10);
-
- cpa_init();
}
void free_init_pages(char *what, unsigned long begin, unsigned long end)
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index d4b6e6a29ae..ae71e11eb3e 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -24,18 +24,47 @@
#ifdef CONFIG_X86_64
+static inline int phys_addr_valid(unsigned long addr)
+{
+ return addr < (1UL << boot_cpu_data.x86_phys_bits);
+}
+
unsigned long __phys_addr(unsigned long x)
{
- if (x >= __START_KERNEL_map)
- return x - __START_KERNEL_map + phys_base;
- return x - PAGE_OFFSET;
+ if (x >= __START_KERNEL_map) {
+ x -= __START_KERNEL_map;
+ VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE);
+ x += phys_base;
+ } else {
+ VIRTUAL_BUG_ON(x < PAGE_OFFSET);
+ x -= PAGE_OFFSET;
+ VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM :
+ !phys_addr_valid(x));
+ }
+ return x;
}
EXPORT_SYMBOL(__phys_addr);
-static inline int phys_addr_valid(unsigned long addr)
+bool __virt_addr_valid(unsigned long x)
{
- return addr < (1UL << boot_cpu_data.x86_phys_bits);
+ if (x >= __START_KERNEL_map) {
+ x -= __START_KERNEL_map;
+ if (x >= KERNEL_IMAGE_SIZE)
+ return false;
+ x += phys_base;
+ } else {
+ if (x < PAGE_OFFSET)
+ return false;
+ x -= PAGE_OFFSET;
+ if (system_state == SYSTEM_BOOTING ?
+ x > MAXMEM : !phys_addr_valid(x)) {
+ return false;
+ }
+ }
+
+ return pfn_valid(x >> PAGE_SHIFT);
}
+EXPORT_SYMBOL(__virt_addr_valid);
#else
@@ -44,6 +73,28 @@ static inline int phys_addr_valid(unsigned long addr)
return 1;
}
+#ifdef CONFIG_DEBUG_VIRTUAL
+unsigned long __phys_addr(unsigned long x)
+{
+ /* VMALLOC_* aren't constants; not available at the boot time */
+ VIRTUAL_BUG_ON(x < PAGE_OFFSET);
+ VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING &&
+ is_vmalloc_addr((void *) x));
+ return x - PAGE_OFFSET;
+}
+EXPORT_SYMBOL(__phys_addr);
+#endif
+
+bool __virt_addr_valid(unsigned long x)
+{
+ if (x < PAGE_OFFSET)
+ return false;
+ if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x))
+ return false;
+ return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
+}
+EXPORT_SYMBOL(__virt_addr_valid);
+
#endif
int page_is_ram(unsigned long pagenr)
@@ -83,6 +134,25 @@ int page_is_ram(unsigned long pagenr)
return 0;
}
+int pagerange_is_ram(unsigned long start, unsigned long end)
+{
+ int ram_page = 0, not_rampage = 0;
+ unsigned long page_nr;
+
+ for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT);
+ ++page_nr) {
+ if (page_is_ram(page_nr))
+ ram_page = 1;
+ else
+ not_rampage = 1;
+
+ if (ram_page == not_rampage)
+ return -1;
+ }
+
+ return ram_page;
+}
+
/*
* Fix up the linear direct mapping of the kernel to avoid cache attribute
* conflicts.
@@ -150,6 +220,12 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
return (__force void __iomem *)phys_to_virt(phys_addr);
/*
+ * Check if the request spans more than any BAR in the iomem resource
+ * tree.
+ */
+ WARN_ON(iomem_map_sanity_check(phys_addr, size));
+
+ /*
* Don't allow anybody to remap normal RAM that we're using..
*/
for (pfn = phys_addr >> PAGE_SHIFT;
@@ -204,16 +280,16 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
switch (prot_val) {
case _PAGE_CACHE_UC:
default:
- prot = PAGE_KERNEL_NOCACHE;
+ prot = PAGE_KERNEL_IO_NOCACHE;
break;
case _PAGE_CACHE_UC_MINUS:
- prot = PAGE_KERNEL_UC_MINUS;
+ prot = PAGE_KERNEL_IO_UC_MINUS;
break;
case _PAGE_CACHE_WC:
- prot = PAGE_KERNEL_WC;
+ prot = PAGE_KERNEL_IO_WC;
break;
case _PAGE_CACHE_WB:
- prot = PAGE_KERNEL;
+ prot = PAGE_KERNEL_IO;
break;
}
@@ -421,7 +497,7 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
return;
}
-int __initdata early_ioremap_debug;
+static int __initdata early_ioremap_debug;
static int __init early_ioremap_debug_setup(char *str)
{
@@ -530,12 +606,12 @@ static void __init __early_set_fixmap(enum fixed_addresses idx,
}
static inline void __init early_set_fixmap(enum fixed_addresses idx,
- unsigned long phys)
+ unsigned long phys, pgprot_t prot)
{
if (after_paging_init)
- set_fixmap(idx, phys);
+ __set_fixmap(idx, phys, prot);
else
- __early_set_fixmap(idx, phys, PAGE_KERNEL);
+ __early_set_fixmap(idx, phys, prot);
}
static inline void __init early_clear_fixmap(enum fixed_addresses idx)
@@ -546,16 +622,22 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx)
__early_set_fixmap(idx, 0, __pgprot(0));
}
-
-int __initdata early_ioremap_nested;
-
+static void *prev_map[FIX_BTMAPS_SLOTS] __initdata;
+static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
static int __init check_early_ioremap_leak(void)
{
- if (!early_ioremap_nested)
+ int count = 0;
+ int i;
+
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+ if (prev_map[i])
+ count++;
+
+ if (!count)
return 0;
WARN(1, KERN_WARNING
"Debug warning: early ioremap leak of %d areas detected.\n",
- early_ioremap_nested);
+ count);
printk(KERN_WARNING
"please boot with early_ioremap_debug and report the dmesg.\n");
@@ -563,18 +645,33 @@ static int __init check_early_ioremap_leak(void)
}
late_initcall(check_early_ioremap_leak);
-void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
+static void __init *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot)
{
unsigned long offset, last_addr;
- unsigned int nrpages, nesting;
+ unsigned int nrpages;
enum fixed_addresses idx0, idx;
+ int i, slot;
WARN_ON(system_state != SYSTEM_BOOTING);
- nesting = early_ioremap_nested;
+ slot = -1;
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
+ if (!prev_map[i]) {
+ slot = i;
+ break;
+ }
+ }
+
+ if (slot < 0) {
+ printk(KERN_INFO "early_iomap(%08lx, %08lx) not found slot\n",
+ phys_addr, size);
+ WARN_ON(1);
+ return NULL;
+ }
+
if (early_ioremap_debug) {
printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ",
- phys_addr, size, nesting);
+ phys_addr, size, slot);
dump_stack();
}
@@ -585,17 +682,13 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
return NULL;
}
- if (nesting >= FIX_BTMAPS_NESTING) {
- WARN_ON(1);
- return NULL;
- }
- early_ioremap_nested++;
+ prev_size[slot] = size;
/*
* Mappings have to be page-aligned
*/
offset = phys_addr & ~PAGE_MASK;
phys_addr &= PAGE_MASK;
- size = PAGE_ALIGN(last_addr) - phys_addr;
+ size = PAGE_ALIGN(last_addr + 1) - phys_addr;
/*
* Mappings have to fit in the FIX_BTMAP area.
@@ -609,10 +702,10 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
/*
* Ok, go for it..
*/
- idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting;
+ idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
idx = idx0;
while (nrpages > 0) {
- early_set_fixmap(idx, phys_addr);
+ early_set_fixmap(idx, phys_addr, prot);
phys_addr += PAGE_SIZE;
--idx;
--nrpages;
@@ -620,7 +713,20 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
if (early_ioremap_debug)
printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0));
- return (void *) (offset + fix_to_virt(idx0));
+ prev_map[slot] = (void *) (offset + fix_to_virt(idx0));
+ return prev_map[slot];
+}
+
+/* Remap an IO device */
+void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
+{
+ return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO);
+}
+
+/* Remap memory */
+void __init *early_memremap(unsigned long phys_addr, unsigned long size)
+{
+ return __early_ioremap(phys_addr, size, PAGE_KERNEL);
}
void __init early_iounmap(void *addr, unsigned long size)
@@ -629,15 +735,33 @@ void __init early_iounmap(void *addr, unsigned long size)
unsigned long offset;
unsigned int nrpages;
enum fixed_addresses idx;
- int nesting;
+ int i, slot;
+
+ slot = -1;
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
+ if (prev_map[i] == addr) {
+ slot = i;
+ break;
+ }
+ }
+
+ if (slot < 0) {
+ printk(KERN_INFO "early_iounmap(%p, %08lx) not found slot\n",
+ addr, size);
+ WARN_ON(1);
+ return;
+ }
- nesting = --early_ioremap_nested;
- if (WARN_ON(nesting < 0))
+ if (prev_size[slot] != size) {
+ printk(KERN_INFO "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n",
+ addr, size, slot, prev_size[slot]);
+ WARN_ON(1);
return;
+ }
if (early_ioremap_debug) {
printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
- size, nesting);
+ size, slot);
dump_stack();
}
@@ -649,12 +773,13 @@ void __init early_iounmap(void *addr, unsigned long size)
offset = virt_addr & ~PAGE_MASK;
nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
- idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting;
+ idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
while (nrpages > 0) {
early_clear_fixmap(idx);
--idx;
--nrpages;
}
+ prev_map[slot] = 0;
}
void __this_fixmap_does_not_exist(void)
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/numa_32.c
index 62fa440678d..847c164725f 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -328,7 +328,7 @@ void __init initmem_init(unsigned long start_pfn,
get_memcfg_numa();
- kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE);
+ kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);
kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
do {
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index a4dd793d600..cebcbf152d4 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -79,7 +79,7 @@ static int __init allocate_cachealigned_memnodemap(void)
return 0;
addr = 0x8000;
- nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
+ nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT,
nodemap_size, L1_CACHE_BYTES);
if (nodemap_addr == -1UL) {
@@ -176,10 +176,10 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
unsigned long bootmap_start, nodedata_phys;
void *bootmap;
- const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
+ const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
int nid;
- start = round_up(start, ZONE_ALIGN);
+ start = roundup(start, ZONE_ALIGN);
printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
start, end);
@@ -210,9 +210,9 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
nid = phys_to_nid(nodedata_phys);
if (nid == nodeid)
- bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
+ bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
else
- bootmap_start = round_up(start, PAGE_SIZE);
+ bootmap_start = roundup(start, PAGE_SIZE);
/*
* SMP_CACHE_BYTES could be enough, but init_bootmem_node like
* to use that to align to PAGE_SIZE
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index d4aa503caaa..e1d10690921 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -32,7 +32,7 @@ enum {
GPS = (1<<30)
};
-#define PAGE_TESTBIT __pgprot(_PAGE_UNUSED1)
+#define PAGE_CPA_TEST __pgprot(_PAGE_CPA_TEST)
static int pte_testbit(pte_t pte)
{
@@ -118,6 +118,7 @@ static int pageattr_test(void)
unsigned int level;
int i, k;
int err;
+ unsigned long test_addr;
if (print)
printk(KERN_INFO "CPA self-test:\n");
@@ -172,7 +173,8 @@ static int pageattr_test(void)
continue;
}
- err = change_page_attr_set(addr[i], len[i], PAGE_TESTBIT);
+ test_addr = addr[i];
+ err = change_page_attr_set(&test_addr, len[i], PAGE_CPA_TEST, 0);
if (err < 0) {
printk(KERN_ERR "CPA %d failed %d\n", i, err);
failed++;
@@ -204,7 +206,8 @@ static int pageattr_test(void)
failed++;
continue;
}
- err = change_page_attr_clear(addr[i], len[i], PAGE_TESTBIT);
+ test_addr = addr[i];
+ err = change_page_attr_clear(&test_addr, len[i], PAGE_CPA_TEST, 0);
if (err < 0) {
printk(KERN_ERR "CPA reverting failed: %d\n", err);
failed++;
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 43e2f8483e4..a9ec89c3fbc 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -25,15 +25,27 @@
* The current flushing context - we pass it instead of 5 arguments:
*/
struct cpa_data {
- unsigned long vaddr;
+ unsigned long *vaddr;
pgprot_t mask_set;
pgprot_t mask_clr;
int numpages;
- int flushtlb;
+ int flags;
unsigned long pfn;
unsigned force_split : 1;
+ int curpage;
};
+/*
+ * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
+ * using cpa_lock. So that we don't allow any other cpu, with stale large tlb
+ * entries change the page attribute in parallel to some other cpu
+ * splitting a large page entry along with changing the attribute.
+ */
+static DEFINE_SPINLOCK(cpa_lock);
+
+#define CPA_FLUSHTLB 1
+#define CPA_ARRAY 2
+
#ifdef CONFIG_PROC_FS
static unsigned long direct_pages_count[PG_LEVEL_NUM];
@@ -84,7 +96,7 @@ static inline unsigned long highmap_start_pfn(void)
static inline unsigned long highmap_end_pfn(void)
{
- return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
+ return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
}
#endif
@@ -190,6 +202,41 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
}
}
+static void cpa_flush_array(unsigned long *start, int numpages, int cache)
+{
+ unsigned int i, level;
+ unsigned long *addr;
+
+ BUG_ON(irqs_disabled());
+
+ on_each_cpu(__cpa_flush_range, NULL, 1);
+
+ if (!cache)
+ return;
+
+ /* 4M threshold */
+ if (numpages >= 1024) {
+ if (boot_cpu_data.x86_model >= 4)
+ wbinvd();
+ return;
+ }
+ /*
+ * We only need to flush on one CPU,
+ * clflush is a MESI-coherent instruction that
+ * will cause all other CPUs to flush the same
+ * cachelines:
+ */
+ for (i = 0, addr = start; i < numpages; i++, addr++) {
+ pte_t *pte = lookup_address(*addr, &level);
+
+ /*
+ * Only flush present addresses:
+ */
+ if (pte && (pte_val(*pte) & _PAGE_PRESENT))
+ clflush_cache_range((void *) *addr, PAGE_SIZE);
+ }
+}
+
/*
* Certain areas of memory on x86 require very specific protection flags,
* for example the BIOS area or kernel text. Callers don't always get this
@@ -398,7 +445,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
*/
new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
__set_pmd_pte(kpte, address, new_pte);
- cpa->flushtlb = 1;
+ cpa->flags |= CPA_FLUSHTLB;
do_split = 0;
}
@@ -408,84 +455,6 @@ out_unlock:
return do_split;
}
-static LIST_HEAD(page_pool);
-static unsigned long pool_size, pool_pages, pool_low;
-static unsigned long pool_used, pool_failed;
-
-static void cpa_fill_pool(struct page **ret)
-{
- gfp_t gfp = GFP_KERNEL;
- unsigned long flags;
- struct page *p;
-
- /*
- * Avoid recursion (on debug-pagealloc) and also signal
- * our priority to get to these pagetables:
- */
- if (current->flags & PF_MEMALLOC)
- return;
- current->flags |= PF_MEMALLOC;
-
- /*
- * Allocate atomically from atomic contexts:
- */
- if (in_atomic() || irqs_disabled() || debug_pagealloc)
- gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
-
- while (pool_pages < pool_size || (ret && !*ret)) {
- p = alloc_pages(gfp, 0);
- if (!p) {
- pool_failed++;
- break;
- }
- /*
- * If the call site needs a page right now, provide it:
- */
- if (ret && !*ret) {
- *ret = p;
- continue;
- }
- spin_lock_irqsave(&pgd_lock, flags);
- list_add(&p->lru, &page_pool);
- pool_pages++;
- spin_unlock_irqrestore(&pgd_lock, flags);
- }
-
- current->flags &= ~PF_MEMALLOC;
-}
-
-#define SHIFT_MB (20 - PAGE_SHIFT)
-#define ROUND_MB_GB ((1 << 10) - 1)
-#define SHIFT_MB_GB 10
-#define POOL_PAGES_PER_GB 16
-
-void __init cpa_init(void)
-{
- struct sysinfo si;
- unsigned long gb;
-
- si_meminfo(&si);
- /*
- * Calculate the number of pool pages:
- *
- * Convert totalram (nr of pages) to MiB and round to the next
- * GiB. Shift MiB to Gib and multiply the result by
- * POOL_PAGES_PER_GB:
- */
- if (debug_pagealloc) {
- gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
- pool_size = POOL_PAGES_PER_GB * gb;
- } else {
- pool_size = 1;
- }
- pool_low = pool_size;
-
- cpa_fill_pool(NULL);
- printk(KERN_DEBUG
- "CPA: page pool initialized %lu of %lu pages preallocated\n",
- pool_pages, pool_size);
-}
-
static int split_large_page(pte_t *kpte, unsigned long address)
{
unsigned long flags, pfn, pfninc = 1;
@@ -494,28 +463,15 @@ static int split_large_page(pte_t *kpte, unsigned long address)
pgprot_t ref_prot;
struct page *base;
- /*
- * Get a page from the pool. The pool list is protected by the
- * pgd_lock, which we have to take anyway for the split
- * operation:
- */
- spin_lock_irqsave(&pgd_lock, flags);
- if (list_empty(&page_pool)) {
- spin_unlock_irqrestore(&pgd_lock, flags);
- base = NULL;
- cpa_fill_pool(&base);
- if (!base)
- return -ENOMEM;
- spin_lock_irqsave(&pgd_lock, flags);
- } else {
- base = list_first_entry(&page_pool, struct page, lru);
- list_del(&base->lru);
- pool_pages--;
-
- if (pool_pages < pool_low)
- pool_low = pool_pages;
- }
+ if (!debug_pagealloc)
+ spin_unlock(&cpa_lock);
+ base = alloc_pages(GFP_KERNEL, 0);
+ if (!debug_pagealloc)
+ spin_lock(&cpa_lock);
+ if (!base)
+ return -ENOMEM;
+ spin_lock_irqsave(&pgd_lock, flags);
/*
* Check for races, another CPU might have split this page
* up for us already:
@@ -572,11 +528,8 @@ out_unlock:
* If we dropped out via the lookup_address check under
* pgd_lock then stick the page back into the pool:
*/
- if (base) {
- list_add(&base->lru, &page_pool);
- pool_pages++;
- } else
- pool_used++;
+ if (base)
+ __free_page(base);
spin_unlock_irqrestore(&pgd_lock, flags);
return 0;
@@ -584,11 +537,16 @@ out_unlock:
static int __change_page_attr(struct cpa_data *cpa, int primary)
{
- unsigned long address = cpa->vaddr;
+ unsigned long address;
int do_split, err;
unsigned int level;
pte_t *kpte, old_pte;
+ if (cpa->flags & CPA_ARRAY)
+ address = cpa->vaddr[cpa->curpage];
+ else
+ address = *cpa->vaddr;
+
repeat:
kpte = lookup_address(address, &level);
if (!kpte)
@@ -600,7 +558,7 @@ repeat:
return 0;
WARN(1, KERN_WARNING "CPA: called for zero pte. "
"vaddr = %lx cpa->vaddr = %lx\n", address,
- cpa->vaddr);
+ *cpa->vaddr);
return -EINVAL;
}
@@ -626,7 +584,7 @@ repeat:
*/
if (pte_val(old_pte) != pte_val(new_pte)) {
set_pte_atomic(kpte, new_pte);
- cpa->flushtlb = 1;
+ cpa->flags |= CPA_FLUSHTLB;
}
cpa->numpages = 1;
return 0;
@@ -650,7 +608,25 @@ repeat:
*/
err = split_large_page(kpte, address);
if (!err) {
- cpa->flushtlb = 1;
+ /*
+ * Do a global flush tlb after splitting the large page
+ * and before we do the actual change page attribute in the PTE.
+ *
+ * With out this, we violate the TLB application note, that says
+ * "The TLBs may contain both ordinary and large-page
+ * translations for a 4-KByte range of linear addresses. This
+ * may occur if software modifies the paging structures so that
+ * the page size used for the address range changes. If the two
+ * translations differ with respect to page frame or attributes
+ * (e.g., permissions), processor behavior is undefined and may
+ * be implementation-specific."
+ *
+ * We do this global tlb flush inside the cpa_lock, so that we
+ * don't allow any other cpu, with stale tlb entries change the
+ * page attribute in parallel, that also falls into the
+ * just split large page entry.
+ */
+ flush_tlb_all();
goto repeat;
}
@@ -663,6 +639,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
{
struct cpa_data alias_cpa;
int ret = 0;
+ unsigned long temp_cpa_vaddr, vaddr;
if (cpa->pfn >= max_pfn_mapped)
return 0;
@@ -675,16 +652,24 @@ static int cpa_process_alias(struct cpa_data *cpa)
* No need to redo, when the primary call touched the direct
* mapping already:
*/
- if (!(within(cpa->vaddr, PAGE_OFFSET,
+ if (cpa->flags & CPA_ARRAY)
+ vaddr = cpa->vaddr[cpa->curpage];
+ else
+ vaddr = *cpa->vaddr;
+
+ if (!(within(vaddr, PAGE_OFFSET,
PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
#ifdef CONFIG_X86_64
- || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32),
+ || within(vaddr, PAGE_OFFSET + (1UL<<32),
PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
#endif
)) {
alias_cpa = *cpa;
- alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
+ temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
+ alias_cpa.vaddr = &temp_cpa_vaddr;
+ alias_cpa.flags &= ~CPA_ARRAY;
+
ret = __change_page_attr_set_clr(&alias_cpa, 0);
}
@@ -696,7 +681,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
* No need to redo, when the primary call touched the high
* mapping already:
*/
- if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end))
+ if (within(vaddr, (unsigned long) _text, (unsigned long) _end))
return 0;
/*
@@ -707,8 +692,9 @@ static int cpa_process_alias(struct cpa_data *cpa)
return 0;
alias_cpa = *cpa;
- alias_cpa.vaddr =
- (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base;
+ temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base;
+ alias_cpa.vaddr = &temp_cpa_vaddr;
+ alias_cpa.flags &= ~CPA_ARRAY;
/*
* The high mapping range is imprecise, so ignore the return value.
@@ -728,8 +714,15 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
* preservation check.
*/
cpa->numpages = numpages;
+ /* for array changes, we can't use large page */
+ if (cpa->flags & CPA_ARRAY)
+ cpa->numpages = 1;
+ if (!debug_pagealloc)
+ spin_lock(&cpa_lock);
ret = __change_page_attr(cpa, checkalias);
+ if (!debug_pagealloc)
+ spin_unlock(&cpa_lock);
if (ret)
return ret;
@@ -746,7 +739,11 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
*/
BUG_ON(cpa->numpages > numpages);
numpages -= cpa->numpages;
- cpa->vaddr += cpa->numpages * PAGE_SIZE;
+ if (cpa->flags & CPA_ARRAY)
+ cpa->curpage++;
+ else
+ *cpa->vaddr += cpa->numpages * PAGE_SIZE;
+
}
return 0;
}
@@ -757,9 +754,9 @@ static inline int cache_attr(pgprot_t attr)
(_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
}
-static int change_page_attr_set_clr(unsigned long addr, int numpages,
+static int change_page_attr_set_clr(unsigned long *addr, int numpages,
pgprot_t mask_set, pgprot_t mask_clr,
- int force_split)
+ int force_split, int array)
{
struct cpa_data cpa;
int ret, cache, checkalias;
@@ -774,21 +771,38 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
return 0;
/* Ensure we are PAGE_SIZE aligned */
- if (addr & ~PAGE_MASK) {
- addr &= PAGE_MASK;
- /*
- * People should not be passing in unaligned addresses:
- */
- WARN_ON_ONCE(1);
+ if (!array) {
+ if (*addr & ~PAGE_MASK) {
+ *addr &= PAGE_MASK;
+ /*
+ * People should not be passing in unaligned addresses:
+ */
+ WARN_ON_ONCE(1);
+ }
+ } else {
+ int i;
+ for (i = 0; i < numpages; i++) {
+ if (addr[i] & ~PAGE_MASK) {
+ addr[i] &= PAGE_MASK;
+ WARN_ON_ONCE(1);
+ }
+ }
}
+ /* Must avoid aliasing mappings in the highmem code */
+ kmap_flush_unused();
+
cpa.vaddr = addr;
cpa.numpages = numpages;
cpa.mask_set = mask_set;
cpa.mask_clr = mask_clr;
- cpa.flushtlb = 0;
+ cpa.flags = 0;
+ cpa.curpage = 0;
cpa.force_split = force_split;
+ if (array)
+ cpa.flags |= CPA_ARRAY;
+
/* No alias checking for _NX bit modifications */
checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
@@ -797,7 +811,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
/*
* Check whether we really changed something:
*/
- if (!cpa.flushtlb)
+ if (!(cpa.flags & CPA_FLUSHTLB))
goto out;
/*
@@ -812,27 +826,30 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
* error case we fall back to cpa_flush_all (which uses
* wbindv):
*/
- if (!ret && cpu_has_clflush)
- cpa_flush_range(addr, numpages, cache);
- else
+ if (!ret && cpu_has_clflush) {
+ if (cpa.flags & CPA_ARRAY)
+ cpa_flush_array(addr, numpages, cache);
+ else
+ cpa_flush_range(*addr, numpages, cache);
+ } else
cpa_flush_all(cache);
out:
- cpa_fill_pool(NULL);
-
return ret;
}
-static inline int change_page_attr_set(unsigned long addr, int numpages,
- pgprot_t mask)
+static inline int change_page_attr_set(unsigned long *addr, int numpages,
+ pgprot_t mask, int array)
{
- return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0);
+ return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
+ array);
}
-static inline int change_page_attr_clear(unsigned long addr, int numpages,
- pgprot_t mask)
+static inline int change_page_attr_clear(unsigned long *addr, int numpages,
+ pgprot_t mask, int array)
{
- return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0);
+ return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
+ array);
}
int _set_memory_uc(unsigned long addr, int numpages)
@@ -840,8 +857,8 @@ int _set_memory_uc(unsigned long addr, int numpages)
/*
* for now UC MINUS. see comments in ioremap_nocache()
*/
- return change_page_attr_set(addr, numpages,
- __pgprot(_PAGE_CACHE_UC_MINUS));
+ return change_page_attr_set(&addr, numpages,
+ __pgprot(_PAGE_CACHE_UC_MINUS), 0);
}
int set_memory_uc(unsigned long addr, int numpages)
@@ -857,10 +874,48 @@ int set_memory_uc(unsigned long addr, int numpages)
}
EXPORT_SYMBOL(set_memory_uc);
+int set_memory_array_uc(unsigned long *addr, int addrinarray)
+{
+ unsigned long start;
+ unsigned long end;
+ int i;
+ /*
+ * for now UC MINUS. see comments in ioremap_nocache()
+ */
+ for (i = 0; i < addrinarray; i++) {
+ start = __pa(addr[i]);
+ for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
+ if (end != __pa(addr[i + 1]))
+ break;
+ i++;
+ }
+ if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
+ goto out;
+ }
+
+ return change_page_attr_set(addr, addrinarray,
+ __pgprot(_PAGE_CACHE_UC_MINUS), 1);
+out:
+ for (i = 0; i < addrinarray; i++) {
+ unsigned long tmp = __pa(addr[i]);
+
+ if (tmp == start)
+ break;
+ for (end = tmp + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
+ if (end != __pa(addr[i + 1]))
+ break;
+ i++;
+ }
+ free_memtype(tmp, end);
+ }
+ return -EINVAL;
+}
+EXPORT_SYMBOL(set_memory_array_uc);
+
int _set_memory_wc(unsigned long addr, int numpages)
{
- return change_page_attr_set(addr, numpages,
- __pgprot(_PAGE_CACHE_WC));
+ return change_page_attr_set(&addr, numpages,
+ __pgprot(_PAGE_CACHE_WC), 0);
}
int set_memory_wc(unsigned long addr, int numpages)
@@ -878,8 +933,8 @@ EXPORT_SYMBOL(set_memory_wc);
int _set_memory_wb(unsigned long addr, int numpages)
{
- return change_page_attr_clear(addr, numpages,
- __pgprot(_PAGE_CACHE_MASK));
+ return change_page_attr_clear(&addr, numpages,
+ __pgprot(_PAGE_CACHE_MASK), 0);
}
int set_memory_wb(unsigned long addr, int numpages)
@@ -890,37 +945,59 @@ int set_memory_wb(unsigned long addr, int numpages)
}
EXPORT_SYMBOL(set_memory_wb);
+int set_memory_array_wb(unsigned long *addr, int addrinarray)
+{
+ int i;
+
+ for (i = 0; i < addrinarray; i++) {
+ unsigned long start = __pa(addr[i]);
+ unsigned long end;
+
+ for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
+ if (end != __pa(addr[i + 1]))
+ break;
+ i++;
+ }
+ free_memtype(start, end);
+ }
+ return change_page_attr_clear(addr, addrinarray,
+ __pgprot(_PAGE_CACHE_MASK), 1);
+}
+EXPORT_SYMBOL(set_memory_array_wb);
+
int set_memory_x(unsigned long addr, int numpages)
{
- return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX));
+ return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
}
EXPORT_SYMBOL(set_memory_x);
int set_memory_nx(unsigned long addr, int numpages)
{
- return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX));
+ return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
}
EXPORT_SYMBOL(set_memory_nx);
int set_memory_ro(unsigned long addr, int numpages)
{
- return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW));
+ return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
}
+EXPORT_SYMBOL_GPL(set_memory_ro);
int set_memory_rw(unsigned long addr, int numpages)
{
- return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW));
+ return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
}
+EXPORT_SYMBOL_GPL(set_memory_rw);
int set_memory_np(unsigned long addr, int numpages)
{
- return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT));
+ return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
}
int set_memory_4k(unsigned long addr, int numpages)
{
- return change_page_attr_set_clr(addr, numpages, __pgprot(0),
- __pgprot(0), 1);
+ return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
+ __pgprot(0), 1, 0);
}
int set_pages_uc(struct page *page, int numpages)
@@ -973,22 +1050,38 @@ int set_pages_rw(struct page *page, int numpages)
static int __set_pages_p(struct page *page, int numpages)
{
- struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
+ unsigned long tempaddr = (unsigned long) page_address(page);
+ struct cpa_data cpa = { .vaddr = &tempaddr,
.numpages = numpages,
.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
- .mask_clr = __pgprot(0)};
+ .mask_clr = __pgprot(0),
+ .flags = 0};
- return __change_page_attr_set_clr(&cpa, 1);
+ /*
+ * No alias checking needed for setting present flag. otherwise,
+ * we may need to break large pages for 64-bit kernel text
+ * mappings (this adds to complexity if we want to do this from
+ * atomic context especially). Let's keep it simple!
+ */
+ return __change_page_attr_set_clr(&cpa, 0);
}
static int __set_pages_np(struct page *page, int numpages)
{
- struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
+ unsigned long tempaddr = (unsigned long) page_address(page);
+ struct cpa_data cpa = { .vaddr = &tempaddr,
.numpages = numpages,
.mask_set = __pgprot(0),
- .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)};
+ .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
+ .flags = 0};
- return __change_page_attr_set_clr(&cpa, 1);
+ /*
+ * No alias checking needed for setting not present flag. otherwise,
+ * we may need to break large pages for 64-bit kernel text
+ * mappings (this adds to complexity if we want to do this from
+ * atomic context especially). Let's keep it simple!
+ */
+ return __change_page_attr_set_clr(&cpa, 0);
}
void kernel_map_pages(struct page *page, int numpages, int enable)
@@ -1008,11 +1101,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
/*
* The return value is ignored as the calls cannot fail.
- * Large pages are kept enabled at boot time, and are
- * split up quickly with DEBUG_PAGEALLOC. If a splitup
- * fails here (due to temporary memory shortage) no damage
- * is done because we just keep the largepage intact up
- * to the next attempt when it will likely be split up:
+ * Large pages for identity mappings are not used at boot time
+ * and hence no memory allocations during large page split.
*/
if (enable)
__set_pages_p(page, numpages);
@@ -1024,53 +1114,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
* but that can deadlock->flush only current cpu:
*/
__flush_tlb_all();
-
- /*
- * Try to refill the page pool here. We can do this only after
- * the tlb flush.
- */
- cpa_fill_pool(NULL);
}
-#ifdef CONFIG_DEBUG_FS
-static int dpa_show(struct seq_file *m, void *v)
-{
- seq_puts(m, "DEBUG_PAGEALLOC\n");
- seq_printf(m, "pool_size : %lu\n", pool_size);
- seq_printf(m, "pool_pages : %lu\n", pool_pages);
- seq_printf(m, "pool_low : %lu\n", pool_low);
- seq_printf(m, "pool_used : %lu\n", pool_used);
- seq_printf(m, "pool_failed : %lu\n", pool_failed);
-
- return 0;
-}
-
-static int dpa_open(struct inode *inode, struct file *filp)
-{
- return single_open(filp, dpa_show, NULL);
-}
-
-static const struct file_operations dpa_fops = {
- .open = dpa_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static int __init debug_pagealloc_proc_init(void)
-{
- struct dentry *de;
-
- de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL,
- &dpa_fops);
- if (!de)
- return -ENOMEM;
-
- return 0;
-}
-__initcall(debug_pagealloc_proc_init);
-#endif
-
#ifdef CONFIG_HIBERNATION
bool kernel_page_present(struct page *page)
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 2a50e0fa64a..738fd0f2495 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -7,24 +7,24 @@
* Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
*/
-#include <linux/mm.h>
+#include <linux/seq_file.h>
+#include <linux/bootmem.h>
+#include <linux/debugfs.h>
#include <linux/kernel.h>
#include <linux/gfp.h>
+#include <linux/mm.h>
#include <linux/fs.h>
-#include <linux/bootmem.h>
-#include <linux/debugfs.h>
-#include <linux/seq_file.h>
-#include <asm/msr.h>
-#include <asm/tlbflush.h>
+#include <asm/cacheflush.h>
#include <asm/processor.h>
-#include <asm/page.h>
+#include <asm/tlbflush.h>
#include <asm/pgtable.h>
-#include <asm/pat.h>
-#include <asm/e820.h>
-#include <asm/cacheflush.h>
#include <asm/fcntl.h>
+#include <asm/e820.h>
#include <asm/mtrr.h>
+#include <asm/page.h>
+#include <asm/msr.h>
+#include <asm/pat.h>
#include <asm/io.h>
#ifdef CONFIG_X86_PAT
@@ -46,6 +46,7 @@ early_param("nopat", nopat);
static int debug_enable;
+
static int __init pat_debug_setup(char *str)
{
debug_enable = 1;
@@ -145,14 +146,14 @@ static char *cattr_name(unsigned long flags)
*/
struct memtype {
- u64 start;
- u64 end;
- unsigned long type;
- struct list_head nd;
+ u64 start;
+ u64 end;
+ unsigned long type;
+ struct list_head nd;
};
static LIST_HEAD(memtype_list);
-static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
+static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
/*
* Does intersection of PAT memory type and MTRR memory type and returns
@@ -180,8 +181,8 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
return req_type;
}
-static int chk_conflict(struct memtype *new, struct memtype *entry,
- unsigned long *type)
+static int
+chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type)
{
if (new->type != entry->type) {
if (type) {
@@ -211,6 +212,66 @@ static struct memtype *cached_entry;
static u64 cached_start;
/*
+ * For RAM pages, mark the pages as non WB memory type using
+ * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or
+ * set_memory_wc() on a RAM page at a time before marking it as WB again.
+ * This is ok, because only one driver will be owning the page and
+ * doing set_memory_*() calls.
+ *
+ * For now, we use PageNonWB to track that the RAM page is being mapped
+ * as non WB. In future, we will have to use one more flag
+ * (or some other mechanism in page_struct) to distinguish between
+ * UC and WC mapping.
+ */
+static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
+ unsigned long *new_type)
+{
+ struct page *page;
+ u64 pfn, end_pfn;
+
+ for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
+ page = pfn_to_page(pfn);
+ if (page_mapped(page) || PageNonWB(page))
+ goto out;
+
+ SetPageNonWB(page);
+ }
+ return 0;
+
+out:
+ end_pfn = pfn;
+ for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
+ page = pfn_to_page(pfn);
+ ClearPageNonWB(page);
+ }
+
+ return -EINVAL;
+}
+
+static int free_ram_pages_type(u64 start, u64 end)
+{
+ struct page *page;
+ u64 pfn, end_pfn;
+
+ for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
+ page = pfn_to_page(pfn);
+ if (page_mapped(page) || !PageNonWB(page))
+ goto out;
+
+ ClearPageNonWB(page);
+ }
+ return 0;
+
+out:
+ end_pfn = pfn;
+ for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
+ page = pfn_to_page(pfn);
+ SetPageNonWB(page);
+ }
+ return -EINVAL;
+}
+
+/*
* req_type typically has one of the:
* - _PAGE_CACHE_WB
* - _PAGE_CACHE_WC
@@ -226,14 +287,15 @@ static u64 cached_start;
* it will return a negative return value.
*/
int reserve_memtype(u64 start, u64 end, unsigned long req_type,
- unsigned long *new_type)
+ unsigned long *new_type)
{
struct memtype *new, *entry;
unsigned long actual_type;
struct list_head *where;
+ int is_range_ram;
int err = 0;
- BUG_ON(start >= end); /* end is exclusive */
+ BUG_ON(start >= end); /* end is exclusive */
if (!pat_enabled) {
/* This is identical to page table setting without PAT */
@@ -266,17 +328,24 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
actual_type = _PAGE_CACHE_WB;
else
actual_type = _PAGE_CACHE_UC_MINUS;
- } else
+ } else {
actual_type = pat_x_mtrr_type(start, end,
req_type & _PAGE_CACHE_MASK);
+ }
+
+ is_range_ram = pagerange_is_ram(start, end);
+ if (is_range_ram == 1)
+ return reserve_ram_pages_type(start, end, req_type, new_type);
+ else if (is_range_ram < 0)
+ return -EINVAL;
new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
if (!new)
return -ENOMEM;
- new->start = start;
- new->end = end;
- new->type = actual_type;
+ new->start = start;
+ new->end = end;
+ new->type = actual_type;
if (new_type)
*new_type = actual_type;
@@ -335,6 +404,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
start, end, cattr_name(new->type), cattr_name(req_type));
kfree(new);
spin_unlock(&memtype_lock);
+
return err;
}
@@ -358,6 +428,7 @@ int free_memtype(u64 start, u64 end)
{
struct memtype *entry;
int err = -EINVAL;
+ int is_range_ram;
if (!pat_enabled)
return 0;
@@ -366,6 +437,12 @@ int free_memtype(u64 start, u64 end)
if (is_ISA_range(start, end - 1))
return 0;
+ is_range_ram = pagerange_is_ram(start, end);
+ if (is_range_ram == 1)
+ return free_ram_pages_type(start, end);
+ else if (is_range_ram < 0)
+ return -EINVAL;
+
spin_lock(&memtype_lock);
list_for_each_entry(entry, &memtype_list, nd) {
if (entry->start == start && entry->end == end) {
@@ -386,6 +463,7 @@ int free_memtype(u64 start, u64 end)
}
dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
+
return err;
}
@@ -492,9 +570,9 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
{
+ unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
u64 addr = (u64)pfn << PAGE_SHIFT;
unsigned long flags;
- unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
reserve_memtype(addr, addr + size, want_flags, &flags);
if (flags != want_flags) {
@@ -514,7 +592,7 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
free_memtype(addr, addr + size);
}
-#if defined(CONFIG_DEBUG_FS)
+#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
/* get Nth element of the linked list */
static struct memtype *memtype_get_idx(loff_t pos)
@@ -537,6 +615,7 @@ static struct memtype *memtype_get_idx(loff_t pos)
}
spin_unlock(&memtype_lock);
kfree(print_entry);
+
return NULL;
}
@@ -567,6 +646,7 @@ static int memtype_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
print_entry->start, print_entry->end);
kfree(print_entry);
+
return 0;
}
@@ -598,4 +678,4 @@ static int __init pat_memtype_list_init(void)
late_initcall(pat_memtype_list_init);
-#endif /* CONFIG_DEBUG_FS */
+#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index d50302774fe..86f2ffc43c3 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -63,10 +63,8 @@ static inline void pgd_list_del(pgd_t *pgd)
#define UNSHARED_PTRS_PER_PGD \
(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
-static void pgd_ctor(void *p)
+static void pgd_ctor(pgd_t *pgd)
{
- pgd_t *pgd = p;
-
/* If the pgd points to a shared pagetable level (either the
ptes in non-PAE, or shared PMD in PAE), then just copy the
references from swapper_pg_dir. */
@@ -87,7 +85,7 @@ static void pgd_ctor(void *p)
pgd_list_add(pgd);
}
-static void pgd_dtor(void *pgd)
+static void pgd_dtor(pgd_t *pgd)
{
unsigned long flags; /* can be called from interrupt context */
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index cab0abbd1eb..0951db9ee51 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -123,7 +123,8 @@ static int __init parse_vmalloc(char *arg)
if (!arg)
return -EINVAL;
- __VMALLOC_RESERVE = memparse(arg, &arg);
+ /* Add VMALLOC_OFFSET to the parsed value due to vm area guard hole*/
+ __VMALLOC_RESERVE = memparse(arg, &arg) + VMALLOC_OFFSET;
return 0;
}
early_param("vmalloc", parse_vmalloc);
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 1b4763e26ea..51c0a2fc14f 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -138,7 +138,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
return;
}
- if (is_uv_system())
+ if (get_uv_system_type() >= UV_X2APIC)
apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
else
apic_id = pa->apic_id;