From 817a824b75b1475f1b067c8cee318c7b4d66fcde Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 26 Feb 2010 17:16:00 +0000 Subject: x86, xen: Disable highmem PTE allocation even when CONFIG_HIGHPTE=y MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's a path in the pagefault code where the kernel deliberately breaks its own locking rules by kmapping a high pte page without holding the pagetable lock (in at least page_check_address). This breaks Xen's ability to track the pinned/unpinned state of the page. There does not appear to be a viable workaround for this behaviour so simply disable HIGHPTE for all Xen guests. Signed-off-by: Ian Campbell LKML-Reference: <1267204562-11844-1-git-send-email-ian.campbell@citrix.com> Cc: Jeremy Fitzhardinge Cc: Ingo Molnar Cc: Pasi Kärkkäinen Cc: # .32.x: 14315592: Allow highmem user page tables to be disabled at boot time Cc: # .32.x Cc: Signed-off-by: H. Peter Anvin --- arch/x86/xen/enlighten.c | 7 +++++++ arch/x86/xen/mmu.c | 11 ++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 36daccb6864..b607239c1ba 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include #include @@ -1094,6 +1095,12 @@ asmlinkage void __init xen_start_kernel(void) __supported_pte_mask |= _PAGE_IOMAP; + /* + * Prevent page tables from being allocated in highmem, even + * if CONFIG_HIGHPTE is enabled. + */ + __userpte_alloc_gfp &= ~__GFP_HIGHMEM; + /* Work out if we support NX */ x86_configure_nx(); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index bf4cd6bfe95..350a3deedf2 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1432,14 +1432,15 @@ static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) { pgprot_t prot = PAGE_KERNEL; + /* + * We disable highmem allocations for page tables so we should never + * see any calls to kmap_atomic_pte on a highmem page. + */ + BUG_ON(PageHighMem(page)); + if (PagePinned(page)) prot = PAGE_KERNEL_RO; - if (0 && PageHighMem(page)) - printk("mapping highpte %lx type %d prot %s\n", - page_to_pfn(page), type, - (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ"); - return kmap_atomic_prot(page, type, prot); } #endif -- cgit v1.2.3-18-g5258 From 3249b7e1df6380e9d7bb3238f64f445bf614f787 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 26 Feb 2010 17:16:01 +0000 Subject: x86, vmi: Disable highmem PTE allocation even when CONFIG_HIGHPTE=y Preventing HIGHPTE allocations under VMI will allow us to remove the kmap_atomic_pte paravirt op. Signed-off-by: Ian Campbell LKML-Reference: <1267204562-11844-2-git-send-email-ian.campbell@citrix.com> Acked-by: Alok Kataria Cc: Ingo Molnar Cc: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/kernel/vmi_32.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index d430e4c3019..58aca86193e 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -272,19 +273,11 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) void *va = kmap_atomic(page, type); /* - * Internally, the VMI ROM must map virtual addresses to physical - * addresses for processing MMU updates. 
By the time MMU updates - * are issued, this information is typically already lost. - * Fortunately, the VMI provides a cache of mapping slots for active - * page tables. - * - * We use slot zero for the linear mapping of physical memory, and - * in HIGHPTE kernels, slot 1 and 2 for KM_PTE0 and KM_PTE1. - * - * args: SLOT VA COUNT PFN + * We disable highmem allocations for page tables so we should never + * see any calls to kmap_atomic_pte on a highmem page. */ - BUG_ON(type != KM_PTE0 && type != KM_PTE1); - vmi_ops.set_linear_mapping((type - KM_PTE0)+1, va, 1, page_to_pfn(page)); + + BUG_ON(PageHighmem(page)); return va; } @@ -640,6 +633,12 @@ static inline int __init activate_vmi(void) u64 reloc; const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc; + /* + * Prevent page tables from being allocated in highmem, even if + * CONFIG_HIGHPTE is enabled. + */ + __userpte_alloc_gfp &= ~__GFP_HIGHMEM; + if (call_vrom_func(vmi_rom, vmi_init) != 0) { printk(KERN_ERR "VMI ROM failed to initialize!"); return 0; -- cgit v1.2.3-18-g5258 From dad52fc01161afcb8798c609e009aed4d104927f Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 26 Feb 2010 17:16:02 +0000 Subject: x86, paravirt: Remove kmap_atomic_pte paravirt op. Now that both Xen and VMI disable allocations of PTE pages from high memory this paravirt op serves no further purpose. This effectively reverts ce6234b5 "add kmap_atomic_pte for mapping highpte pages". Signed-off-by: Ian Campbell LKML-Reference: <1267204562-11844-3-git-send-email-ian.campbell@citrix.com> Acked-by: Alok Kataria Cc: Jeremy Fitzhardinge Cc: Ingo Molnar Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/highmem.h | 4 ---- arch/x86/include/asm/paravirt.h | 9 --------- arch/x86/include/asm/paravirt_types.h | 4 ---- arch/x86/include/asm/pgtable_32.h | 4 ++-- arch/x86/kernel/paravirt.c | 4 ---- arch/x86/kernel/vmi_32.c | 20 -------------------- arch/x86/xen/mmu.c | 22 ---------------------- 7 files changed, 2 insertions(+), 65 deletions(-) diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 014c2b85ae4..a726650fc80 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -66,10 +66,6 @@ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); struct page *kmap_atomic_to_page(void *ptr); -#ifndef CONFIG_PARAVIRT -#define kmap_atomic_pte(page, type) kmap_atomic(page, type) -#endif - #define flush_cache_kmaps() do { } while (0) extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn, diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index dd59a85a918..5653f43d90e 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -435,15 +435,6 @@ static inline void paravirt_release_pud(unsigned long pfn) PVOP_VCALL1(pv_mmu_ops.release_pud, pfn); } -#ifdef CONFIG_HIGHPTE -static inline void *kmap_atomic_pte(struct page *page, enum km_type type) -{ - unsigned long ret; - ret = PVOP_CALL2(unsigned long, pv_mmu_ops.kmap_atomic_pte, page, type); - return (void *)ret; -} -#endif - static inline void pte_update(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index b1e70d51e40..db9ef553234 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -304,10 +304,6 @@ struct pv_mmu_ops { #endif /* PAGETABLE_LEVELS 
== 4 */ #endif /* PAGETABLE_LEVELS >= 3 */ -#ifdef CONFIG_HIGHPTE - void *(*kmap_atomic_pte)(struct page *page, enum km_type type); -#endif - struct pv_lazy_ops lazy_mode; /* dom0 ops */ diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 01fd9461d32..b422d2201af 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -54,10 +54,10 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t); in_irq() ? KM_IRQ_PTE : \ KM_PTE0) #define pte_offset_map(dir, address) \ - ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), __KM_PTE) + \ + ((pte_t *)kmap_atomic(pmd_page(*(dir)), __KM_PTE) + \ pte_index((address))) #define pte_offset_map_nested(dir, address) \ - ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \ + ((pte_t *)kmap_atomic(pmd_page(*(dir)), KM_PTE1) + \ pte_index((address))) #define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE) #define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 1b1739d1631..1db183ed7c0 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -428,10 +428,6 @@ struct pv_mmu_ops pv_mmu_ops = { .ptep_modify_prot_start = __ptep_modify_prot_start, .ptep_modify_prot_commit = __ptep_modify_prot_commit, -#ifdef CONFIG_HIGHPTE - .kmap_atomic_pte = kmap_atomic, -#endif - #if PAGETABLE_LEVELS >= 3 #ifdef CONFIG_X86_PAE .set_pte_atomic = native_set_pte_atomic, diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 58aca86193e..7dd599deca4 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -267,22 +267,6 @@ static void vmi_nop(void) { } -#ifdef CONFIG_HIGHPTE -static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) -{ - void *va = kmap_atomic(page, type); - - /* - * We disable highmem allocations for page tables so we should never - * see any calls to kmap_atomic_pte on a highmem page. - */ - - BUG_ON(PageHighmem(page)); - - return va; -} -#endif - static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn) { vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); @@ -777,10 +761,6 @@ static inline int __init activate_vmi(void) /* Set linear is needed in all cases */ vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping); -#ifdef CONFIG_HIGHPTE - if (vmi_ops.set_linear_mapping) - pv_mmu_ops.kmap_atomic_pte = vmi_kmap_atomic_pte; -#endif /* * These MUST always be patched. Don't support indirect jumps diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 350a3deedf2..f9eb7de74f4 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1427,24 +1427,6 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) #endif } -#ifdef CONFIG_HIGHPTE -static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) -{ - pgprot_t prot = PAGE_KERNEL; - - /* - * We disable highmem allocations for page tables so we should never - * see any calls to kmap_atomic_pte on a highmem page. 
- */ - BUG_ON(PageHighMem(page)); - - if (PagePinned(page)) - prot = PAGE_KERNEL_RO; - - return kmap_atomic_prot(page, type, prot); -} -#endif - #ifdef CONFIG_X86_32 static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) { @@ -1903,10 +1885,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .alloc_pmd_clone = paravirt_nop, .release_pmd = xen_release_pmd_init, -#ifdef CONFIG_HIGHPTE - .kmap_atomic_pte = xen_kmap_atomic_pte, -#endif - #ifdef CONFIG_X86_64 .set_pte = xen_set_pte, #else -- cgit v1.2.3-18-g5258 From 37b99dd5372cff42f83210c280f314f10f99138e Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Mon, 1 Mar 2010 21:55:51 +0800 Subject: resource: Fix generic page_is_ram() for partial RAM pages The System RAM walk shall skip partial RAM pages and avoid calling func() on them. So that page_is_ram() return 0 for a partial RAM page. In particular, it shall not call func() with len=0. This fixes a boot time bug reported by Sachin and root caused by Thomas: > >>> WARNING: at arch/x86/mm/ioremap.c:111 __ioremap_caller+0x169/0x2f1() > >>> Hardware name: BladeCenter LS21 -[79716AA]- > >>> Modules linked in: > >>> Pid: 0, comm: swapper Not tainted 2.6.33-git6-autotest #1 > >>> Call Trace: > >>> [] ? __ioremap_caller+0x169/0x2f1 > >>> [] warn_slowpath_common+0x77/0xa4 > >>> [] warn_slowpath_null+0xf/0x11 > >>> [] __ioremap_caller+0x169/0x2f1 > >>> [] ? acpi_os_map_memory+0x12/0x1b > >>> [] ioremap_nocache+0x12/0x14 > >>> [] acpi_os_map_memory+0x12/0x1b > >>> [] acpi_tb_verify_table+0x29/0x5b > >>> [] acpi_load_tables+0x39/0x15a > >>> [] acpi_early_init+0x60/0xf5 > >>> [] start_kernel+0x397/0x3a7 > >>> [] x86_64_start_reservations+0xa5/0xa9 > >>> [] x86_64_start_kernel+0xe1/0xe8 > >>> ---[ end trace 4eaa2a86a8e2da22 ]--- > >>> ioremap reserve_memtype failed -22 The return code is -EINVAL, so it failed in the is_ram check, which is not too surprising > BIOS-provided physical RAM map: > BIOS-e820: 0000000000000000 - 000000000009c000 (usable) > BIOS-e820: 000000000009c000 - 00000000000a0000 (reserved) > BIOS-e820: 00000000000e0000 - 0000000000100000 (reserved) > BIOS-e820: 0000000000100000 - 00000000cffa3900 (usable) > BIOS-e820: 00000000cffa3900 - 00000000cffa7400 (ACPI data) The ACPI data is not starting on a page boundary and neither does the usable RAM area end on a page boundary. Very useful ! > ACPI: DSDT 00000000cffa3900 036CE (v01 IBM SERLEWIS 00001000 INTL 20060912) ACPI is trying to map DSDT at cffa3900, which results in a check vs. cffa3000 which is the relevant page boundary. The generic is_ram check correctly identifies that as RAM because it's in the usable resource area. The old e820 based is_ram check does not take overlapping resource areas into account. That's why it works. CC: Sachin Sant CC: Thomas Gleixner CC: KAMEZAWA Hiroyuki Signed-off-by: Wu Fengguang LKML-Reference: <20100301135551.GA9998@localhost> Signed-off-by: H. 
Peter Anvin --- kernel/resource.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/resource.c b/kernel/resource.c index 03c897f7935..8f0e3d0f4bf 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -274,7 +274,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)) { struct resource res; - unsigned long pfn, len; + unsigned long pfn, end_pfn; u64 orig_end; int ret = -1; @@ -284,9 +284,10 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, orig_end = res.end; while ((res.start < res.end) && (find_next_system_ram(&res, "System RAM") >= 0)) { - pfn = (unsigned long)(res.start >> PAGE_SHIFT); - len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT); - ret = (*func)(pfn, len, arg); + pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; + end_pfn = (res.end + 1) >> PAGE_SHIFT; + if (end_pfn > pfn) + ret = (*func)(pfn, end_pfn - pfn, arg); if (ret) break; res.start = res.end + 1; -- cgit v1.2.3-18-g5258 From f41496607e03ab99f263b8e26689ad0fc853007f Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 2 Mar 2010 11:21:09 -0800 Subject: resource: Fix broken indentation Fix broken indentation in patch 37b99dd5372cff42f83210c280f314f10f99138e. Reported-by: Linus Torvalds Cc: Wu Fengguang LKML-Reference: <20100301135551.GA9998@localhost> Signed-off-by: H. Peter Anvin --- kernel/resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/resource.c b/kernel/resource.c index 8f0e3d0f4bf..91f430fd467 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -287,7 +287,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; end_pfn = (res.end + 1) >> PAGE_SHIFT; if (end_pfn > pfn) - ret = (*func)(pfn, end_pfn - pfn, arg); + ret = (*func)(pfn, end_pfn - pfn, arg); if (ret) break; res.start = res.end + 1; -- cgit v1.2.3-18-g5258
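Editor's note, for illustration only (not part of the patches above): the fixed walk_system_ram_range() loop rounds the resource start up and the end down to page boundaries, and only invokes the callback when at least one whole page remains, so func() can no longer be called with len == 0 on a partial RAM page. Below is a minimal, self-contained sketch of that rounding rule, assuming a 4 KiB page size and using the e820 values from the bug report; names here are hypothetical and not from the kernel source.

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/*
 * Round a [start, end] byte range to the whole pages it covers.
 * Returns the number of whole pages; *pfn is the first full page frame.
 * Partial pages at either end are skipped, mirroring the resource.c fix:
 *   pfn     = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
 *   end_pfn = (res.end + 1) >> PAGE_SHIFT;
 */
static unsigned long full_pages(unsigned long long start, unsigned long long end,
				unsigned long *pfn)
{
	unsigned long first   = (unsigned long)((start + PAGE_SIZE - 1) >> PAGE_SHIFT);
	unsigned long end_pfn = (unsigned long)((end + 1) >> PAGE_SHIFT);

	*pfn = first;
	return end_pfn > first ? end_pfn - first : 0;
}

int main(void)
{
	unsigned long pfn, n;

	/* Usable RAM from the reported e820 map: 0x100000 - 0xcffa3900,
	 * i.e. the range ends mid-page. Only whole pages are counted. */
	n = full_pages(0x100000ULL, 0xcffa38ffULL, &pfn);
	printf("first pfn %#lx, %lu whole pages\n", pfn, n);

	/* A sub-page range yields zero whole pages, so the callback would
	 * never be invoked with len == 0. */
	n = full_pages(0xcffa3900ULL, 0xcffa3affULL, &pfn);
	printf("whole pages in partial range: %lu\n", n);
	return 0;
}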