diff options
Diffstat (limited to 'arch/powerpc/mm')
45 files changed, 11231 insertions, 3222 deletions
diff --git a/arch/powerpc/mm/40x_mmu.c b/arch/powerpc/mm/40x_mmu.c index 3899ea97fbd..5810967511d 100644 --- a/arch/powerpc/mm/40x_mmu.c +++ b/arch/powerpc/mm/40x_mmu.c @@ -35,6 +35,7 @@ #include <linux/init.h> #include <linux/delay.h> #include <linux/highmem.h> +#include <linux/memblock.h> #include <asm/pgalloc.h> #include <asm/prom.h> @@ -47,6 +48,7 @@ #include <asm/bootx.h> #include <asm/machdep.h> #include <asm/setup.h> + #include "mmu_decl.h" extern int __map_without_ltlbs; @@ -84,20 +86,20 @@ void __init MMU_init_hw(void) * vectors and the kernel live in real-mode. */ - mtspr(SPRN_DCCR, 0xF0000000); /* 512 MB of data space at 0x0. */ - mtspr(SPRN_ICCR, 0xF0000000); /* 512 MB of instr. space at 0x0. */ + mtspr(SPRN_DCCR, 0xFFFF0000); /* 2GByte of data space at 0x0. */ + mtspr(SPRN_ICCR, 0xFFFF0000); /* 2GByte of instr. space at 0x0. */ } #define LARGE_PAGE_SIZE_16M (1<<24) #define LARGE_PAGE_SIZE_4M (1<<22) -unsigned long __init mmu_mapin_ram(void) +unsigned long __init mmu_mapin_ram(unsigned long top) { - unsigned long v, s; + unsigned long v, s, mapped; phys_addr_t p; v = KERNELBASE; - p = PPC_MEMSTART; + p = 0; s = total_lowmem; if (__map_without_ltlbs) @@ -105,7 +107,7 @@ unsigned long __init mmu_mapin_ram(void) while (s >= LARGE_PAGE_SIZE_16M) { pmd_t *pmdp; - unsigned long val = p | _PMD_SIZE_16M | _PAGE_HWEXEC | _PAGE_HWWRITE; + unsigned long val = p | _PMD_SIZE_16M | _PAGE_EXEC | _PAGE_HWWRITE; pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v); pmd_val(*pmdp++) = val; @@ -120,7 +122,7 @@ unsigned long __init mmu_mapin_ram(void) while (s >= LARGE_PAGE_SIZE_4M) { pmd_t *pmdp; - unsigned long val = p | _PMD_SIZE_4M | _PAGE_HWEXEC | _PAGE_HWWRITE; + unsigned long val = p | _PMD_SIZE_4M | _PAGE_EXEC | _PAGE_HWWRITE; pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v); pmd_val(*pmdp) = val; @@ -130,5 +132,28 @@ unsigned long __init mmu_mapin_ram(void) s -= LARGE_PAGE_SIZE_4M; } - return total_lowmem - s; + mapped = total_lowmem - s; + + /* If the size of RAM is not an exact power of two, we may not + * have covered RAM in its entirety with 16 and 4 MiB + * pages. Consequently, restrict the top end of RAM currently + * allocable so that calls to the MEMBLOCK to allocate PTEs for "tail" + * coverage with normal-sized pages (or other reasons) do not + * attempt to allocate outside the allowed range. + */ + memblock_set_current_limit(mapped); + + return mapped; +} + +void setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + /* We don't currently support the first MEMBLOCK not mapping 0 + * physical on those processors + */ + BUG_ON(first_memblock_base != 0); + + /* 40x can only access 16MB at the moment (see head_40x.S) */ + memblock_set_current_limit(min_t(u64, first_memblock_size, 0x00800000)); } diff --git a/arch/powerpc/mm/44x_mmu.c b/arch/powerpc/mm/44x_mmu.c index 04dc08798d3..82b1ff759e2 100644 --- a/arch/powerpc/mm/44x_mmu.c +++ b/arch/powerpc/mm/44x_mmu.c @@ -24,9 +24,11 @@ */ #include <linux/init.h> +#include <linux/memblock.h> + #include <asm/mmu.h> -#include <asm/system.h> #include <asm/page.h> +#include <asm/cacheflush.h> #include "mmu_decl.h" @@ -37,11 +39,39 @@ unsigned int tlb_44x_index; /* = 0 */ unsigned int tlb_44x_hwater = PPC44x_TLB_SIZE - 1 - PPC44x_EARLY_TLBS; int icache_44x_need_flush; +unsigned long tlb_47x_boltmap[1024/8]; + +static void ppc44x_update_tlb_hwater(void) +{ + extern unsigned int tlb_44x_patch_hwater_D[]; + extern unsigned int tlb_44x_patch_hwater_I[]; + + /* The TLB miss handlers hard codes the watermark in a cmpli + * instruction to improve performances rather than loading it + * from the global variable. Thus, we patch the instructions + * in the 2 TLB miss handlers when updating the value + */ + tlb_44x_patch_hwater_D[0] = (tlb_44x_patch_hwater_D[0] & 0xffff0000) | + tlb_44x_hwater; + flush_icache_range((unsigned long)&tlb_44x_patch_hwater_D[0], + (unsigned long)&tlb_44x_patch_hwater_D[1]); + tlb_44x_patch_hwater_I[0] = (tlb_44x_patch_hwater_I[0] & 0xffff0000) | + tlb_44x_hwater; + flush_icache_range((unsigned long)&tlb_44x_patch_hwater_I[0], + (unsigned long)&tlb_44x_patch_hwater_I[1]); +} + /* - * "Pins" a 256MB TLB entry in AS0 for kernel lowmem + * "Pins" a 256MB TLB entry in AS0 for kernel lowmem for 44x type MMU */ static void __init ppc44x_pin_tlb(unsigned int virt, unsigned int phys) { + unsigned int entry = tlb_44x_hwater--; + + ppc44x_update_tlb_hwater(); + + mtspr(SPRN_MMUCR, 0); + __asm__ __volatile__( "tlbwe %2,%3,%4\n" "tlbwe %1,%3,%5\n" @@ -50,26 +80,175 @@ static void __init ppc44x_pin_tlb(unsigned int virt, unsigned int phys) : "r" (PPC44x_TLB_SW | PPC44x_TLB_SR | PPC44x_TLB_SX | PPC44x_TLB_G), "r" (phys), "r" (virt | PPC44x_TLB_VALID | PPC44x_TLB_256M), - "r" (tlb_44x_hwater--), /* slot for this TLB entry */ + "r" (entry), "i" (PPC44x_TLB_PAGEID), "i" (PPC44x_TLB_XLAT), "i" (PPC44x_TLB_ATTRIB)); } +static int __init ppc47x_find_free_bolted(void) +{ + unsigned int mmube0 = mfspr(SPRN_MMUBE0); + unsigned int mmube1 = mfspr(SPRN_MMUBE1); + + if (!(mmube0 & MMUBE0_VBE0)) + return 0; + if (!(mmube0 & MMUBE0_VBE1)) + return 1; + if (!(mmube0 & MMUBE0_VBE2)) + return 2; + if (!(mmube1 & MMUBE1_VBE3)) + return 3; + if (!(mmube1 & MMUBE1_VBE4)) + return 4; + if (!(mmube1 & MMUBE1_VBE5)) + return 5; + return -1; +} + +static void __init ppc47x_update_boltmap(void) +{ + unsigned int mmube0 = mfspr(SPRN_MMUBE0); + unsigned int mmube1 = mfspr(SPRN_MMUBE1); + + if (mmube0 & MMUBE0_VBE0) + __set_bit((mmube0 >> MMUBE0_IBE0_SHIFT) & 0xff, + tlb_47x_boltmap); + if (mmube0 & MMUBE0_VBE1) + __set_bit((mmube0 >> MMUBE0_IBE1_SHIFT) & 0xff, + tlb_47x_boltmap); + if (mmube0 & MMUBE0_VBE2) + __set_bit((mmube0 >> MMUBE0_IBE2_SHIFT) & 0xff, + tlb_47x_boltmap); + if (mmube1 & MMUBE1_VBE3) + __set_bit((mmube1 >> MMUBE1_IBE3_SHIFT) & 0xff, + tlb_47x_boltmap); + if (mmube1 & MMUBE1_VBE4) + __set_bit((mmube1 >> MMUBE1_IBE4_SHIFT) & 0xff, + tlb_47x_boltmap); + if (mmube1 & MMUBE1_VBE5) + __set_bit((mmube1 >> MMUBE1_IBE5_SHIFT) & 0xff, + tlb_47x_boltmap); +} + +/* + * "Pins" a 256MB TLB entry in AS0 for kernel lowmem for 47x type MMU + */ +static void ppc47x_pin_tlb(unsigned int virt, unsigned int phys) +{ + unsigned int rA; + int bolted; + + /* Base rA is HW way select, way 0, bolted bit set */ + rA = 0x88000000; + + /* Look for a bolted entry slot */ + bolted = ppc47x_find_free_bolted(); + BUG_ON(bolted < 0); + + /* Insert bolted slot number */ + rA |= bolted << 24; + + pr_debug("256M TLB entry for 0x%08x->0x%08x in bolt slot %d\n", + virt, phys, bolted); + + mtspr(SPRN_MMUCR, 0); + + __asm__ __volatile__( + "tlbwe %2,%3,0\n" + "tlbwe %1,%3,1\n" + "tlbwe %0,%3,2\n" + : + : "r" (PPC47x_TLB2_SW | PPC47x_TLB2_SR | + PPC47x_TLB2_SX +#ifdef CONFIG_SMP + | PPC47x_TLB2_M +#endif + ), + "r" (phys), + "r" (virt | PPC47x_TLB0_VALID | PPC47x_TLB0_256M), + "r" (rA)); +} + void __init MMU_init_hw(void) { + /* This is not useful on 47x but won't hurt either */ + ppc44x_update_tlb_hwater(); + flush_instruction_cache(); } -unsigned long __init mmu_mapin_ram(void) +unsigned long __init mmu_mapin_ram(unsigned long top) { unsigned long addr; + unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1); /* Pin in enough TLBs to cover any lowmem not covered by the * initial 256M mapping established in head_44x.S */ - for (addr = PPC_PIN_SIZE; addr < total_lowmem; - addr += PPC_PIN_SIZE) - ppc44x_pin_tlb(addr + PAGE_OFFSET, addr); + for (addr = memstart + PPC_PIN_SIZE; addr < lowmem_end_addr; + addr += PPC_PIN_SIZE) { + if (mmu_has_feature(MMU_FTR_TYPE_47x)) + ppc47x_pin_tlb(addr + PAGE_OFFSET, addr); + else + ppc44x_pin_tlb(addr + PAGE_OFFSET, addr); + } + if (mmu_has_feature(MMU_FTR_TYPE_47x)) { + ppc47x_update_boltmap(); +#ifdef DEBUG + { + int i; + + printk(KERN_DEBUG "bolted entries: "); + for (i = 0; i < 255; i++) { + if (test_bit(i, tlb_47x_boltmap)) + printk("%d ", i); + } + printk("\n"); + } +#endif /* DEBUG */ + } return total_lowmem; } + +void setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + u64 size; + +#ifndef CONFIG_NONSTATIC_KERNEL + /* We don't currently support the first MEMBLOCK not mapping 0 + * physical on those processors + */ + BUG_ON(first_memblock_base != 0); +#endif + + /* 44x has a 256M TLB entry pinned at boot */ + size = (min_t(u64, first_memblock_size, PPC_PIN_SIZE)); + memblock_set_current_limit(first_memblock_base + size); +} + +#ifdef CONFIG_SMP +void mmu_init_secondary(int cpu) +{ + unsigned long addr; + unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1); + + /* Pin in enough TLBs to cover any lowmem not covered by the + * initial 256M mapping established in head_44x.S + * + * WARNING: This is called with only the first 256M of the + * linear mapping in the TLB and we can't take faults yet + * so beware of what this code uses. It runs off a temporary + * stack. current (r2) isn't initialized, smp_processor_id() + * will not work, current thread info isn't accessible, ... + */ + for (addr = memstart + PPC_PIN_SIZE; addr < lowmem_end_addr; + addr += PPC_PIN_SIZE) { + if (mmu_has_feature(MMU_FTR_TYPE_47x)) + ppc47x_pin_tlb(addr + PAGE_OFFSET, addr); + else + ppc44x_pin_tlb(addr + PAGE_OFFSET, addr); + } +} +#endif /* CONFIG_SMP */ diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 41649a5d360..51230ee6a40 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -2,24 +2,37 @@ # Makefile for the linux ppc-specific parts of the memory manager. # -ifeq ($(CONFIG_PPC64),y) -EXTRA_CFLAGS += -mno-minimal-toc -endif +subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror + +ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) -obj-y := fault.o mem.o lmb.o \ +obj-y := fault.o mem.o pgtable.o gup.o mmap.o \ init_$(CONFIG_WORD_SIZE).o \ - pgtable_$(CONFIG_WORD_SIZE).o \ - mmu_context_$(CONFIG_WORD_SIZE).o -hash-$(CONFIG_PPC_NATIVE) := hash_native_64.o -obj-$(CONFIG_PPC64) += hash_utils_64.o \ - slb_low.o slb.o stab.o mmap.o $(hash-y) + pgtable_$(CONFIG_WORD_SIZE).o +obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ + tlb_nohash_low.o +obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(CONFIG_WORD_SIZE)e.o +hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o +obj-$(CONFIG_PPC_STD_MMU_64) += hash_utils_64.o \ + slb_low.o slb.o stab.o \ + $(hash64-y) obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o obj-$(CONFIG_PPC_STD_MMU) += hash_low_$(CONFIG_WORD_SIZE).o \ - tlb_$(CONFIG_WORD_SIZE).o + tlb_hash$(CONFIG_WORD_SIZE).o \ + mmu_context_hash$(CONFIG_WORD_SIZE).o +obj-$(CONFIG_PPC_ICSWX) += icswx.o +obj-$(CONFIG_PPC_ICSWX_PID) += icswx_pid.o obj-$(CONFIG_40x) += 40x_mmu.o obj-$(CONFIG_44x) += 44x_mmu.o -obj-$(CONFIG_FSL_BOOKE) += fsl_booke_mmu.o +obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_booke_mmu.o obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o obj-$(CONFIG_PPC_MM_SLICES) += slice.o -obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o +obj-y += hugetlbpage.o +ifeq ($(CONFIG_HUGETLB_PAGE),y) +obj-$(CONFIG_PPC_STD_MMU_64) += hugetlbpage-hash64.o +obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o +endif +obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o +obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o +obj-$(CONFIG_HIGHMEM) += highmem.o diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c new file mode 100644 index 00000000000..7b6c1075017 --- /dev/null +++ b/arch/powerpc/mm/dma-noncoherent.c @@ -0,0 +1,419 @@ +/* + * PowerPC version derived from arch/arm/mm/consistent.c + * Copyright (C) 2001 Dan Malek (dmalek@jlc.net) + * + * Copyright (C) 2000 Russell King + * + * Consistent memory allocators. Used for DMA devices that want to + * share uncached memory with the processor core. The function return + * is the virtual address and 'dma_handle' is the physical address. + * Mostly stolen from the ARM port, with some changes for PowerPC. + * -- Dan + * + * Reorganized to get rid of the arch-specific consistent_* functions + * and provide non-coherent implementations for the DMA API. -Matt + * + * Added in_interrupt() safe dma_alloc_coherent()/dma_free_coherent() + * implementation. This is pulled straight from ARM and barely + * modified. -Matt + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/highmem.h> +#include <linux/dma-mapping.h> +#include <linux/export.h> + +#include <asm/tlbflush.h> + +#include "mmu_decl.h" + +/* + * This address range defaults to a value that is safe for all + * platforms which currently set CONFIG_NOT_COHERENT_CACHE. It + * can be further configured for specific applications under + * the "Advanced Setup" menu. -Matt + */ +#define CONSISTENT_BASE (IOREMAP_TOP) +#define CONSISTENT_END (CONSISTENT_BASE + CONFIG_CONSISTENT_SIZE) +#define CONSISTENT_OFFSET(x) (((unsigned long)(x) - CONSISTENT_BASE) >> PAGE_SHIFT) + +/* + * This is the page table (2MB) covering uncached, DMA consistent allocations + */ +static DEFINE_SPINLOCK(consistent_lock); + +/* + * VM region handling support. + * + * This should become something generic, handling VM region allocations for + * vmalloc and similar (ioremap, module space, etc). + * + * I envisage vmalloc()'s supporting vm_struct becoming: + * + * struct vm_struct { + * struct vm_region region; + * unsigned long flags; + * struct page **pages; + * unsigned int nr_pages; + * unsigned long phys_addr; + * }; + * + * get_vm_area() would then call vm_region_alloc with an appropriate + * struct vm_region head (eg): + * + * struct vm_region vmalloc_head = { + * .vm_list = LIST_HEAD_INIT(vmalloc_head.vm_list), + * .vm_start = VMALLOC_START, + * .vm_end = VMALLOC_END, + * }; + * + * However, vmalloc_head.vm_start is variable (typically, it is dependent on + * the amount of RAM found at boot time.) I would imagine that get_vm_area() + * would have to initialise this each time prior to calling vm_region_alloc(). + */ +struct ppc_vm_region { + struct list_head vm_list; + unsigned long vm_start; + unsigned long vm_end; +}; + +static struct ppc_vm_region consistent_head = { + .vm_list = LIST_HEAD_INIT(consistent_head.vm_list), + .vm_start = CONSISTENT_BASE, + .vm_end = CONSISTENT_END, +}; + +static struct ppc_vm_region * +ppc_vm_region_alloc(struct ppc_vm_region *head, size_t size, gfp_t gfp) +{ + unsigned long addr = head->vm_start, end = head->vm_end - size; + unsigned long flags; + struct ppc_vm_region *c, *new; + + new = kmalloc(sizeof(struct ppc_vm_region), gfp); + if (!new) + goto out; + + spin_lock_irqsave(&consistent_lock, flags); + + list_for_each_entry(c, &head->vm_list, vm_list) { + if ((addr + size) < addr) + goto nospc; + if ((addr + size) <= c->vm_start) + goto found; + addr = c->vm_end; + if (addr > end) + goto nospc; + } + + found: + /* + * Insert this entry _before_ the one we found. + */ + list_add_tail(&new->vm_list, &c->vm_list); + new->vm_start = addr; + new->vm_end = addr + size; + + spin_unlock_irqrestore(&consistent_lock, flags); + return new; + + nospc: + spin_unlock_irqrestore(&consistent_lock, flags); + kfree(new); + out: + return NULL; +} + +static struct ppc_vm_region *ppc_vm_region_find(struct ppc_vm_region *head, unsigned long addr) +{ + struct ppc_vm_region *c; + + list_for_each_entry(c, &head->vm_list, vm_list) { + if (c->vm_start == addr) + goto out; + } + c = NULL; + out: + return c; +} + +/* + * Allocate DMA-coherent memory space and return both the kernel remapped + * virtual and bus address for that space. + */ +void * +__dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp) +{ + struct page *page; + struct ppc_vm_region *c; + unsigned long order; + u64 mask = ISA_DMA_THRESHOLD, limit; + + if (dev) { + mask = dev->coherent_dma_mask; + + /* + * Sanity check the DMA mask - it must be non-zero, and + * must be able to be satisfied by a DMA allocation. + */ + if (mask == 0) { + dev_warn(dev, "coherent DMA mask is unset\n"); + goto no_page; + } + + if ((~mask) & ISA_DMA_THRESHOLD) { + dev_warn(dev, "coherent DMA mask %#llx is smaller " + "than system GFP_DMA mask %#llx\n", + mask, (unsigned long long)ISA_DMA_THRESHOLD); + goto no_page; + } + } + + + size = PAGE_ALIGN(size); + limit = (mask + 1) & ~mask; + if ((limit && size >= limit) || + size >= (CONSISTENT_END - CONSISTENT_BASE)) { + printk(KERN_WARNING "coherent allocation too big (requested %#x mask %#Lx)\n", + size, mask); + return NULL; + } + + order = get_order(size); + + /* Might be useful if we ever have a real legacy DMA zone... */ + if (mask != 0xffffffff) + gfp |= GFP_DMA; + + page = alloc_pages(gfp, order); + if (!page) + goto no_page; + + /* + * Invalidate any data that might be lurking in the + * kernel direct-mapped region for device DMA. + */ + { + unsigned long kaddr = (unsigned long)page_address(page); + memset(page_address(page), 0, size); + flush_dcache_range(kaddr, kaddr + size); + } + + /* + * Allocate a virtual address in the consistent mapping region. + */ + c = ppc_vm_region_alloc(&consistent_head, size, + gfp & ~(__GFP_DMA | __GFP_HIGHMEM)); + if (c) { + unsigned long vaddr = c->vm_start; + struct page *end = page + (1 << order); + + split_page(page, order); + + /* + * Set the "dma handle" + */ + *handle = page_to_phys(page); + + do { + SetPageReserved(page); + map_page(vaddr, page_to_phys(page), + pgprot_noncached(PAGE_KERNEL)); + page++; + vaddr += PAGE_SIZE; + } while (size -= PAGE_SIZE); + + /* + * Free the otherwise unused pages. + */ + while (page < end) { + __free_page(page); + page++; + } + + return (void *)c->vm_start; + } + + if (page) + __free_pages(page, order); + no_page: + return NULL; +} +EXPORT_SYMBOL(__dma_alloc_coherent); + +/* + * free a page as defined by the above mapping. + */ +void __dma_free_coherent(size_t size, void *vaddr) +{ + struct ppc_vm_region *c; + unsigned long flags, addr; + + size = PAGE_ALIGN(size); + + spin_lock_irqsave(&consistent_lock, flags); + + c = ppc_vm_region_find(&consistent_head, (unsigned long)vaddr); + if (!c) + goto no_area; + + if ((c->vm_end - c->vm_start) != size) { + printk(KERN_ERR "%s: freeing wrong coherent size (%ld != %d)\n", + __func__, c->vm_end - c->vm_start, size); + dump_stack(); + size = c->vm_end - c->vm_start; + } + + addr = c->vm_start; + do { + pte_t *ptep; + unsigned long pfn; + + ptep = pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(addr), + addr), + addr), + addr); + if (!pte_none(*ptep) && pte_present(*ptep)) { + pfn = pte_pfn(*ptep); + pte_clear(&init_mm, addr, ptep); + if (pfn_valid(pfn)) { + struct page *page = pfn_to_page(pfn); + __free_reserved_page(page); + } + } + addr += PAGE_SIZE; + } while (size -= PAGE_SIZE); + + flush_tlb_kernel_range(c->vm_start, c->vm_end); + + list_del(&c->vm_list); + + spin_unlock_irqrestore(&consistent_lock, flags); + + kfree(c); + return; + + no_area: + spin_unlock_irqrestore(&consistent_lock, flags); + printk(KERN_ERR "%s: trying to free invalid coherent area: %p\n", + __func__, vaddr); + dump_stack(); +} +EXPORT_SYMBOL(__dma_free_coherent); + +/* + * make an area consistent. + */ +void __dma_sync(void *vaddr, size_t size, int direction) +{ + unsigned long start = (unsigned long)vaddr; + unsigned long end = start + size; + + switch (direction) { + case DMA_NONE: + BUG(); + case DMA_FROM_DEVICE: + /* + * invalidate only when cache-line aligned otherwise there is + * the potential for discarding uncommitted data from the cache + */ + if ((start & (L1_CACHE_BYTES - 1)) || (size & (L1_CACHE_BYTES - 1))) + flush_dcache_range(start, end); + else + invalidate_dcache_range(start, end); + break; + case DMA_TO_DEVICE: /* writeback only */ + clean_dcache_range(start, end); + break; + case DMA_BIDIRECTIONAL: /* writeback and invalidate */ + flush_dcache_range(start, end); + break; + } +} +EXPORT_SYMBOL(__dma_sync); + +#ifdef CONFIG_HIGHMEM +/* + * __dma_sync_page() implementation for systems using highmem. + * In this case, each page of a buffer must be kmapped/kunmapped + * in order to have a virtual address for __dma_sync(). This must + * not sleep so kmap_atomic()/kunmap_atomic() are used. + * + * Note: yes, it is possible and correct to have a buffer extend + * beyond the first page. + */ +static inline void __dma_sync_page_highmem(struct page *page, + unsigned long offset, size_t size, int direction) +{ + size_t seg_size = min((size_t)(PAGE_SIZE - offset), size); + size_t cur_size = seg_size; + unsigned long flags, start, seg_offset = offset; + int nr_segs = 1 + ((size - seg_size) + PAGE_SIZE - 1)/PAGE_SIZE; + int seg_nr = 0; + + local_irq_save(flags); + + do { + start = (unsigned long)kmap_atomic(page + seg_nr) + seg_offset; + + /* Sync this buffer segment */ + __dma_sync((void *)start, seg_size, direction); + kunmap_atomic((void *)start); + seg_nr++; + + /* Calculate next buffer segment size */ + seg_size = min((size_t)PAGE_SIZE, size - cur_size); + + /* Add the segment size to our running total */ + cur_size += seg_size; + seg_offset = 0; + } while (seg_nr < nr_segs); + + local_irq_restore(flags); +} +#endif /* CONFIG_HIGHMEM */ + +/* + * __dma_sync_page makes memory consistent. identical to __dma_sync, but + * takes a struct page instead of a virtual address + */ +void __dma_sync_page(struct page *page, unsigned long offset, + size_t size, int direction) +{ +#ifdef CONFIG_HIGHMEM + __dma_sync_page_highmem(page, offset, size, direction); +#else + unsigned long start = (unsigned long)page_address(page) + offset; + __dma_sync((void *)start, size, direction); +#endif +} +EXPORT_SYMBOL(__dma_sync_page); + +/* + * Return the PFN for a given cpu virtual address returned by + * __dma_alloc_coherent. This is used by dma_mmap_coherent() + */ +unsigned long __dma_get_coherent_pfn(unsigned long cpu_addr) +{ + /* This should always be populated, so we don't test every + * level. If that fails, we'll have a nice crash which + * will be as good as a BUG_ON() + */ + pgd_t *pgd = pgd_offset_k(cpu_addr); + pud_t *pud = pud_offset(pgd, cpu_addr); + pmd_t *pmd = pmd_offset(pud, cpu_addr); + pte_t *ptep = pte_offset_kernel(pmd, cpu_addr); + + if (pte_none(*ptep) || !pte_present(*ptep)) + return 0; + return pte_pfn(*ptep); +} diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 7b251079926..51ab9e7e6c3 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -29,16 +29,23 @@ #include <linux/module.h> #include <linux/kprobes.h> #include <linux/kdebug.h> +#include <linux/perf_event.h> +#include <linux/magic.h> +#include <linux/ratelimit.h> +#include <linux/context_tracking.h> +#include <asm/firmware.h> #include <asm/page.h> #include <asm/pgtable.h> #include <asm/mmu.h> #include <asm/mmu_context.h> -#include <asm/system.h> #include <asm/uaccess.h> #include <asm/tlbflush.h> #include <asm/siginfo.h> +#include <asm/debug.h> +#include <mm/mmu_decl.h> +#include "icswx.h" #ifdef CONFIG_KPROBES static inline int notify_page_fault(struct pt_regs *regs) @@ -99,31 +106,80 @@ static int store_updates_sp(struct pt_regs *regs) } return 0; } +/* + * do_page_fault error handling helpers + */ -#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE)) -static void do_dabr(struct pt_regs *regs, unsigned long address, - unsigned long error_code) +#define MM_FAULT_RETURN 0 +#define MM_FAULT_CONTINUE -1 +#define MM_FAULT_ERR(sig) (sig) + +static int do_sigbus(struct pt_regs *regs, unsigned long address) { siginfo_t info; - if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code, - 11, SIGSEGV) == NOTIFY_STOP) - return; + up_read(¤t->mm->mmap_sem); - if (debugger_dabr_match(regs)) - return; + if (user_mode(regs)) { + current->thread.trap_nr = BUS_ADRERR; + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = BUS_ADRERR; + info.si_addr = (void __user *)address; + force_sig_info(SIGBUS, &info, current); + return MM_FAULT_RETURN; + } + return MM_FAULT_ERR(SIGBUS); +} + +static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault) +{ + /* + * Pagefault was interrupted by SIGKILL. We have no reason to + * continue the pagefault. + */ + if (fatal_signal_pending(current)) { + /* + * If we have retry set, the mmap semaphore will have + * alrady been released in __lock_page_or_retry(). Else + * we release it now. + */ + if (!(fault & VM_FAULT_RETRY)) + up_read(¤t->mm->mmap_sem); + /* Coming from kernel, we need to deal with uaccess fixups */ + if (user_mode(regs)) + return MM_FAULT_RETURN; + return MM_FAULT_ERR(SIGKILL); + } + + /* No fault: be happy */ + if (!(fault & VM_FAULT_ERROR)) + return MM_FAULT_CONTINUE; + + /* Out of memory */ + if (fault & VM_FAULT_OOM) { + up_read(¤t->mm->mmap_sem); - /* Clear the DABR */ - set_dabr(0); + /* + * We ran out of memory, or some other thing happened to us that + * made us unable to handle the page fault gracefully. + */ + if (!user_mode(regs)) + return MM_FAULT_ERR(SIGKILL); + pagefault_out_of_memory(); + return MM_FAULT_RETURN; + } + + /* Bus error. x86 handles HWPOISON here, we'll add this if/when + * we support the feature in HW + */ + if (fault & VM_FAULT_SIGBUS) + return do_sigbus(regs, addr); - /* Deliver the signal to userspace */ - info.si_signo = SIGTRAP; - info.si_errno = 0; - info.si_code = TRAP_HWBKPT; - info.si_addr = (void __user *)address; - force_sig_info(SIGTRAP, &info, current); + /* We don't understand the fault code, this is fatal */ + BUG(); + return MM_FAULT_CONTINUE; } -#endif /* !(CONFIG_4xx || CONFIG_BOOKE)*/ /* * For 600- and 800-family processors, the error_code parameter is DSISR @@ -141,13 +197,16 @@ static void do_dabr(struct pt_regs *regs, unsigned long address, int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, unsigned long error_code) { + enum ctx_state prev_state = exception_enter(); struct vm_area_struct * vma; struct mm_struct *mm = current->mm; - siginfo_t info; + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; int code = SEGV_MAPERR; - int is_write = 0, ret; + int is_write = 0; int trap = TRAP(regs); int is_exec = trap == 0x400; + int fault; + int rc = 0, store_update_sp = 0; #if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE)) /* @@ -164,27 +223,49 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, is_write = error_code & ESR_DST; #endif /* CONFIG_4xx || CONFIG_BOOKE */ +#ifdef CONFIG_PPC_ICSWX + /* + * we need to do this early because this "data storage + * interrupt" does not update the DAR/DEAR so we don't want to + * look at it + */ + if (error_code & ICSWX_DSI_UCT) { + rc = acop_handle_fault(regs, address, error_code); + if (rc) + goto bail; + } +#endif /* CONFIG_PPC_ICSWX */ + if (notify_page_fault(regs)) - return 0; + goto bail; if (unlikely(debugger_fault_handler(regs))) - return 0; + goto bail; /* On a kernel SLB miss we can only check for a valid exception entry */ - if (!user_mode(regs) && (address >= TASK_SIZE)) - return SIGSEGV; + if (!user_mode(regs) && (address >= TASK_SIZE)) { + rc = SIGSEGV; + goto bail; + } -#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE)) +#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE) || \ + defined(CONFIG_PPC_BOOK3S_64)) if (error_code & DSISR_DABRMATCH) { - /* DABR match */ - do_dabr(regs, address, error_code); - return 0; + /* breakpoint match */ + do_break(regs, address, error_code); + goto bail; } -#endif /* !(CONFIG_4xx || CONFIG_BOOKE)*/ +#endif + + /* We restore the interrupt state now */ + if (!arch_irq_disabled_regs(regs)) + local_irq_enable(); if (in_atomic() || mm == NULL) { - if (!user_mode(regs)) - return SIGSEGV; + if (!user_mode(regs)) { + rc = SIGSEGV; + goto bail; + } /* in_atomic() in user mode is really bad, as is current->mm == NULL. */ printk(KERN_EMERG "Page fault in user mode with " @@ -194,6 +275,19 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, die("Weird page fault", regs, SIGSEGV); } + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + + /* + * We want to do this outside mmap_sem, because reading code around nip + * can result in fault, which will cause a deadlock when called with + * mmap_sem held + */ + if (user_mode(regs)) + store_update_sp = store_updates_sp(regs); + + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; + /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the * kernel and should generate an OOPS. Unfortunately, in the case of an @@ -213,7 +307,15 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, if (!user_mode(regs) && !search_exception_tables(regs->nip)) goto bad_area_nosemaphore; +retry: down_read(&mm->mmap_sem); + } else { + /* + * The above down_read_trylock() might have succeeded in + * which case we'll have missed the might_sleep() from + * down_read(): + */ + might_sleep(); } vma = find_vma(mm, address); @@ -251,8 +353,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, * between the last mapped region and the stack will * expand the stack rather than segfaulting. */ - if (address + 2048 < uregs->gpr[1] - && (!user_mode(regs) || !store_updates_sp(regs))) + if (address + 2048 < uregs->gpr[1] && !store_update_sp) goto bad_area; } if (expand_stack(vma, address)) @@ -267,6 +368,12 @@ good_area: goto bad_area; #endif /* CONFIG_6xx */ #if defined(CONFIG_8xx) + /* 8xx sometimes need to load a invalid/non-present TLBs. + * These must be invalidated separately as linux mm don't. + */ + if (error_code & 0x40000000) /* no translation? */ + _tlbil_va(address, 0, 0, 0); + /* The MPC8xx seems to always set 0x80000000, which is * "undefined". Of those that can be set, this is the only * one which seems bad. @@ -277,48 +384,38 @@ good_area: #endif /* CONFIG_8xx */ if (is_exec) { -#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE)) - /* protection fault */ +#ifdef CONFIG_PPC_STD_MMU + /* Protection fault on exec go straight to failure on + * Hash based MMUs as they either don't support per-page + * execute permission, or if they do, it's handled already + * at the hash level. This test would probably have to + * be removed if we change the way this works to make hash + * processors use the same I/D cache coherency mechanism + * as embedded. + */ if (error_code & DSISR_PROTFAULT) goto bad_area; +#endif /* CONFIG_PPC_STD_MMU */ + /* * Allow execution from readable areas if the MMU does not * provide separate controls over reading and executing. + * + * Note: That code used to not be enabled for 4xx/BookE. + * It is now as I/D cache coherency for these is done at + * set_pte_at() time and I see no reason why the test + * below wouldn't be valid on those processors. This -may- + * break programs compiled with a really old ABI though. */ if (!(vma->vm_flags & VM_EXEC) && (cpu_has_feature(CPU_FTR_NOEXECUTE) || !(vma->vm_flags & (VM_READ | VM_WRITE)))) goto bad_area; -#else - pte_t *ptep; - pmd_t *pmdp; - - /* Since 4xx/Book-E supports per-page execute permission, - * we lazily flush dcache to icache. */ - ptep = NULL; - if (get_pteptr(mm, address, &ptep, &pmdp)) { - spinlock_t *ptl = pte_lockptr(mm, pmdp); - spin_lock(ptl); - if (pte_present(*ptep)) { - struct page *page = pte_page(*ptep); - - if (!test_bit(PG_arch_1, &page->flags)) { - flush_dcache_icache_page(page); - set_bit(PG_arch_1, &page->flags); - } - pte_update(ptep, 0, _PAGE_HWEXEC); - _tlbie(address, mm->context.id); - pte_unmap_unlock(ptep, ptl); - up_read(&mm->mmap_sem); - return 0; - } - pte_unmap_unlock(ptep, ptl); - } -#endif /* a write */ } else if (is_write) { if (!(vma->vm_flags & VM_WRITE)) goto bad_area; + flags |= FAULT_FLAG_WRITE; /* a read */ } else { /* protection fault */ @@ -333,21 +430,52 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - survive: - ret = handle_mm_fault(mm, vma, address, is_write); - if (unlikely(ret & VM_FAULT_ERROR)) { - if (ret & VM_FAULT_OOM) - goto out_of_memory; - else if (ret & VM_FAULT_SIGBUS) - goto do_sigbus; - BUG(); + fault = handle_mm_fault(mm, vma, address, flags); + if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) { + rc = mm_fault_error(regs, address, fault); + if (rc >= MM_FAULT_RETURN) + goto bail; + else + rc = 0; } - if (ret & VM_FAULT_MAJOR) - current->maj_flt++; - else - current->min_flt++; + + /* + * Major/minor page fault accounting is only done on the + * initial attempt. If we go through a retry, it is extremely + * likely that the page will be found in page cache at that point. + */ + if (flags & FAULT_FLAG_ALLOW_RETRY) { + if (fault & VM_FAULT_MAJOR) { + current->maj_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, + regs, address); +#ifdef CONFIG_PPC_SMLPAR + if (firmware_has_feature(FW_FEATURE_CMO)) { + u32 page_ins; + + preempt_disable(); + page_ins = be32_to_cpu(get_lppaca()->page_ins); + page_ins += 1 << PAGE_FACTOR; + get_lppaca()->page_ins = cpu_to_be32(page_ins); + preempt_enable(); + } +#endif /* CONFIG_PPC_SMLPAR */ + } else { + current->min_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, + regs, address); + } + if (fault & VM_FAULT_RETRY) { + /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk + * of starvation. */ + flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; + goto retry; + } + } + up_read(&mm->mmap_sem); - return 0; + goto bail; bad_area: up_read(&mm->mmap_sem); @@ -356,44 +484,20 @@ bad_area_nosemaphore: /* User mode accesses cause a SIGSEGV */ if (user_mode(regs)) { _exception(SIGSEGV, regs, code, address); - return 0; + goto bail; } - if (is_exec && (error_code & DSISR_PROTFAULT) - && printk_ratelimit()) - printk(KERN_CRIT "kernel tried to execute NX-protected" - " page (%lx) - exploit attempt? (uid: %d)\n", - address, current->uid); + if (is_exec && (error_code & DSISR_PROTFAULT)) + printk_ratelimited(KERN_CRIT "kernel tried to execute NX-protected" + " page (%lx) - exploit attempt? (uid: %d)\n", + address, from_kuid(&init_user_ns, current_uid())); - return SIGSEGV; + rc = SIGSEGV; -/* - * We ran out of memory, or some other thing happened to us that made - * us unable to handle the page fault gracefully. - */ -out_of_memory: - up_read(&mm->mmap_sem); - if (is_global_init(current)) { - yield(); - down_read(&mm->mmap_sem); - goto survive; - } - printk("VM: killing process %s\n", current->comm); - if (user_mode(regs)) - do_group_exit(SIGKILL); - return SIGKILL; +bail: + exception_exit(prev_state); + return rc; -do_sigbus: - up_read(&mm->mmap_sem); - if (user_mode(regs)) { - info.si_signo = SIGBUS; - info.si_errno = 0; - info.si_code = BUS_ADRERR; - info.si_addr = (void __user *)address; - force_sig_info(SIGBUS, &info, current); - return 0; - } - return SIGBUS; } /* @@ -404,6 +508,7 @@ do_sigbus: void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) { const struct exception_table_entry *entry; + unsigned long *stackend; /* Are we prepared to handle this fault? */ if ((entry = search_exception_tables(regs->nip)) != NULL) { @@ -432,5 +537,9 @@ void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n", regs->nip); + stackend = end_of_stack(current); + if (current != &init_task && *stackend != STACK_END_MAGIC) + printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); + die("Kernel access of bad area", regs, sig); } diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c index c93a966b7e4..94cd728166d 100644 --- a/arch/powerpc/mm/fsl_booke_mmu.c +++ b/arch/powerpc/mm/fsl_booke_mmu.c @@ -2,7 +2,7 @@ * Modifications by Kumar Gala (galak@kernel.crashing.org) to support * E500 Book E processors. * - * Copyright 2004 Freescale Semiconductor, Inc + * Copyright 2004,2010 Freescale Semiconductor, Inc. * * This file contains the routines for initializing the MMU * on the 4xx series of chips. @@ -40,6 +40,7 @@ #include <linux/init.h> #include <linux/delay.h> #include <linux/highmem.h> +#include <linux/memblock.h> #include <asm/pgalloc.h> #include <asm/prom.h> @@ -49,41 +50,34 @@ #include <asm/mmu.h> #include <asm/uaccess.h> #include <asm/smp.h> -#include <asm/bootx.h> #include <asm/machdep.h> #include <asm/setup.h> +#include <asm/paca.h> + +#include "mmu_decl.h" -extern void loadcam_entry(unsigned int index); unsigned int tlbcam_index; -unsigned int num_tlbcam_entries; -static unsigned long __cam0, __cam1, __cam2; -extern unsigned long total_lowmem; -extern unsigned long __max_low_memory; -extern unsigned long __initial_memory_limit; -#define MAX_LOW_MEM CONFIG_LOWMEM_SIZE - -#define NUM_TLBCAMS (16) - -struct tlbcam { - u32 MAS0; - u32 MAS1; - u32 MAS2; - u32 MAS3; - u32 MAS7; -} TLBCAM[NUM_TLBCAMS]; + +#define NUM_TLBCAMS (64) +struct tlbcam TLBCAM[NUM_TLBCAMS]; struct tlbcamrange { - unsigned long start; + unsigned long start; unsigned long limit; phys_addr_t phys; } tlbcam_addrs[NUM_TLBCAMS]; extern unsigned int tlbcam_index; +unsigned long tlbcam_sz(int idx) +{ + return tlbcam_addrs[idx].limit - tlbcam_addrs[idx].start + 1; +} + /* * Return PA for this VA if it is mapped by a CAM, or 0 */ -unsigned long v_mapped_by_tlbcam(unsigned long va) +phys_addr_t v_mapped_by_tlbcam(unsigned long va) { int b; for (b = 0; b < tlbcam_index; ++b) @@ -95,29 +89,30 @@ unsigned long v_mapped_by_tlbcam(unsigned long va) /* * Return VA for a given PA or 0 if not mapped */ -unsigned long p_mapped_by_tlbcam(unsigned long pa) +unsigned long p_mapped_by_tlbcam(phys_addr_t pa) { int b; for (b = 0; b < tlbcam_index; ++b) if (pa >= tlbcam_addrs[b].phys - && pa < (tlbcam_addrs[b].limit-tlbcam_addrs[b].start) + && pa < (tlbcam_addrs[b].limit-tlbcam_addrs[b].start) +tlbcam_addrs[b].phys) return tlbcam_addrs[b].start+(pa-tlbcam_addrs[b].phys); return 0; } /* - * Set up one of the I/D BAT (block address translation) register pairs. - * The parameters are not checked; in particular size must be a power - * of 4 between 4k and 256M. + * Set up a variable-size TLB entry (tlbcam). The parameters are not checked; + * in particular size must be a power of 4 between 4k and the max supported by + * an implementation; max may further be limited by what can be represented in + * an unsigned long (for example, 32-bit implementations cannot support a 4GB + * size). */ -void settlbcam(int index, unsigned long virt, phys_addr_t phys, - unsigned int size, int flags, unsigned int pid) +static void settlbcam(int index, unsigned long virt, phys_addr_t phys, + unsigned long size, unsigned long flags, unsigned int pid) { - unsigned int tsize, lz; + unsigned int tsize; - asm ("cntlzw %0,%1" : "=r" (lz) : "r" (size)); - tsize = (21 - lz) / 2; + tsize = __ilog2(size) - 10; #ifdef CONFIG_SMP if ((flags & _PAGE_NO_CACHE) == 0) @@ -134,18 +129,16 @@ void settlbcam(int index, unsigned long virt, phys_addr_t phys, TLBCAM[index].MAS2 |= (flags & _PAGE_GUARDED) ? MAS2_G : 0; TLBCAM[index].MAS2 |= (flags & _PAGE_ENDIAN) ? MAS2_E : 0; - TLBCAM[index].MAS3 = (phys & PAGE_MASK) | MAS3_SX | MAS3_SR; + TLBCAM[index].MAS3 = (phys & MAS3_RPN) | MAS3_SX | MAS3_SR; TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_SW : 0); + if (mmu_has_feature(MMU_FTR_BIG_PHYS)) + TLBCAM[index].MAS7 = (u64)phys >> 32; -#ifndef CONFIG_KGDB /* want user access for breakpoints */ - if (flags & _PAGE_USER) { + /* Below is unlikely -- only for large user pages or similar */ + if (pte_user(flags)) { TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR; TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0); } -#else - TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR; - TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0); -#endif tlbcam_addrs[index].start = virt; tlbcam_addrs[index].limit = virt + size - 1; @@ -154,27 +147,77 @@ void settlbcam(int index, unsigned long virt, phys_addr_t phys, loadcam_entry(index); } -void invalidate_tlbcam_entry(int index) +unsigned long calc_cam_sz(unsigned long ram, unsigned long virt, + phys_addr_t phys) { - TLBCAM[index].MAS0 = MAS0_TLBSEL(1) | MAS0_ESEL(index); - TLBCAM[index].MAS1 = ~MAS1_VALID; + unsigned int camsize = __ilog2(ram); + unsigned int align = __ffs(virt | phys); + unsigned long max_cam; - loadcam_entry(index); + if ((mfspr(SPRN_MMUCFG) & MMUCFG_MAVN) == MMUCFG_MAVN_V1) { + /* Convert (4^max) kB to (2^max) bytes */ + max_cam = ((mfspr(SPRN_TLB1CFG) >> 16) & 0xf) * 2 + 10; + camsize &= ~1U; + align &= ~1U; + } else { + /* Convert (2^max) kB to (2^max) bytes */ + max_cam = __ilog2(mfspr(SPRN_TLB1PS)) + 10; + } + + if (camsize > align) + camsize = align; + if (camsize > max_cam) + camsize = max_cam; + + return 1UL << camsize; } -void __init cam_mapin_ram(unsigned long cam0, unsigned long cam1, - unsigned long cam2) +static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt, + unsigned long ram, int max_cam_idx) { - settlbcam(0, PAGE_OFFSET, PPC_MEMSTART, cam0, _PAGE_KERNEL, 0); - tlbcam_index++; - if (cam1) { - tlbcam_index++; - settlbcam(1, PAGE_OFFSET+cam0, PPC_MEMSTART+cam0, cam1, _PAGE_KERNEL, 0); - } - if (cam2) { - tlbcam_index++; - settlbcam(2, PAGE_OFFSET+cam0+cam1, PPC_MEMSTART+cam0+cam1, cam2, _PAGE_KERNEL, 0); + int i; + unsigned long amount_mapped = 0; + + /* Calculate CAM values */ + for (i = 0; ram && i < max_cam_idx; i++) { + unsigned long cam_sz; + + cam_sz = calc_cam_sz(ram, virt, phys); + settlbcam(i, virt, phys, cam_sz, PAGE_KERNEL_X, 0); + + ram -= cam_sz; + amount_mapped += cam_sz; + virt += cam_sz; + phys += cam_sz; } + tlbcam_index = i; + +#ifdef CONFIG_PPC64 + get_paca()->tcd.esel_next = i; + get_paca()->tcd.esel_max = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; + get_paca()->tcd.esel_first = i; +#endif + + return amount_mapped; +} + +unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx) +{ + unsigned long virt = PAGE_OFFSET; + phys_addr_t phys = memstart_addr; + + return map_mem_in_cams_addr(phys, virt, ram, max_cam_idx); +} + +#ifdef CONFIG_PPC32 + +#if defined(CONFIG_LOWMEM_CAM_NUM_BOOL) && (CONFIG_LOWMEM_CAM_NUM >= NUM_TLBCAMS) +#error "LOWMEM_CAM_NUM must be less than NUM_TLBCAMS" +#endif + +unsigned long __init mmu_mapin_ram(unsigned long top) +{ + return tlbcam_addrs[tlbcam_index - 1].limit - PAGE_OFFSET + 1; } /* @@ -185,53 +228,91 @@ void __init MMU_init_hw(void) flush_instruction_cache(); } -unsigned long __init mmu_mapin_ram(void) +void __init adjust_total_lowmem(void) { - cam_mapin_ram(__cam0, __cam1, __cam2); + unsigned long ram; + int i; - return __cam0 + __cam1 + __cam2; -} + /* adjust lowmem size to __max_low_memory */ + ram = min((phys_addr_t)__max_low_memory, (phys_addr_t)total_lowmem); + i = switch_to_as1(); + __max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM); + restore_to_as0(i, 0, 0, 1); -void __init -adjust_total_lowmem(void) + pr_info("Memory CAM mapping: "); + for (i = 0; i < tlbcam_index - 1; i++) + pr_cont("%lu/", tlbcam_sz(i) >> 20); + pr_cont("%lu Mb, residual: %dMb\n", tlbcam_sz(tlbcam_index - 1) >> 20, + (unsigned int)((total_lowmem - __max_low_memory) >> 20)); + + memblock_set_current_limit(memstart_addr + __max_low_memory); +} + +void setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) { - unsigned long max_low_mem = MAX_LOW_MEM; - unsigned long cam_max = 0x10000000; - unsigned long ram; + phys_addr_t limit = first_memblock_base + first_memblock_size; - /* adjust CAM size to max_low_mem */ - if (max_low_mem < cam_max) - cam_max = max_low_mem; + /* 64M mapped initially according to head_fsl_booke.S */ + memblock_set_current_limit(min_t(u64, limit, 0x04000000)); +} - /* adjust lowmem size to max_low_mem */ - if (max_low_mem < total_lowmem) - ram = max_low_mem; - else - ram = total_lowmem; +#ifdef CONFIG_RELOCATABLE +int __initdata is_second_reloc; +notrace void __init relocate_init(u64 dt_ptr, phys_addr_t start) +{ + unsigned long base = KERNELBASE; - /* Calculate CAM values */ - __cam0 = 1UL << 2 * (__ilog2(ram) / 2); - if (__cam0 > cam_max) - __cam0 = cam_max; - ram -= __cam0; - if (ram) { - __cam1 = 1UL << 2 * (__ilog2(ram) / 2); - if (__cam1 > cam_max) - __cam1 = cam_max; - ram -= __cam1; - } - if (ram) { - __cam2 = 1UL << 2 * (__ilog2(ram) / 2); - if (__cam2 > cam_max) - __cam2 = cam_max; - ram -= __cam2; + kernstart_addr = start; + if (is_second_reloc) { + virt_phys_offset = PAGE_OFFSET - memstart_addr; + return; } - printk(KERN_INFO "Memory CAM mapping: CAM0=%ldMb, CAM1=%ldMb," - " CAM2=%ldMb residual: %ldMb\n", - __cam0 >> 20, __cam1 >> 20, __cam2 >> 20, - (total_lowmem - __cam0 - __cam1 - __cam2) >> 20); - __max_low_memory = max_low_mem = __cam0 + __cam1 + __cam2; - __initial_memory_limit = __max_low_memory; + /* + * Relocatable kernel support based on processing of dynamic + * relocation entries. Before we get the real memstart_addr, + * We will compute the virt_phys_offset like this: + * virt_phys_offset = stext.run - kernstart_addr + * + * stext.run = (KERNELBASE & ~0x3ffffff) + + * (kernstart_addr & 0x3ffffff) + * When we relocate, we have : + * + * (kernstart_addr & 0x3ffffff) = (stext.run & 0x3ffffff) + * + * hence: + * virt_phys_offset = (KERNELBASE & ~0x3ffffff) - + * (kernstart_addr & ~0x3ffffff) + * + */ + start &= ~0x3ffffff; + base &= ~0x3ffffff; + virt_phys_offset = base - start; + early_get_first_memblock_info(__va(dt_ptr), NULL); + /* + * We now get the memstart_addr, then we should check if this + * address is the same as what the PAGE_OFFSET map to now. If + * not we have to change the map of PAGE_OFFSET to memstart_addr + * and do a second relocation. + */ + if (start != memstart_addr) { + int n; + long offset = start - memstart_addr; + + is_second_reloc = 1; + n = switch_to_as1(); + /* map a 64M area for the second relocation */ + if (memstart_addr > start) + map_mem_in_cams(0x4000000, CONFIG_LOWMEM_CAM_NUM); + else + map_mem_in_cams_addr(start, PAGE_OFFSET + offset, + 0x4000000, CONFIG_LOWMEM_CAM_NUM); + restore_to_as0(n, offset, __va(dt_ptr), 1); + /* We should never reach here */ + panic("Relocation error"); + } } +#endif +#endif diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c new file mode 100644 index 00000000000..d8746684f60 --- /dev/null +++ b/arch/powerpc/mm/gup.c @@ -0,0 +1,235 @@ +/* + * Lockless get_user_pages_fast for powerpc + * + * Copyright (C) 2008 Nick Piggin + * Copyright (C) 2008 Novell Inc. + */ +#undef DEBUG + +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/vmstat.h> +#include <linux/pagemap.h> +#include <linux/rwsem.h> +#include <asm/pgtable.h> + +#ifdef __HAVE_ARCH_PTE_SPECIAL + +/* + * The performance critical leaf functions are made noinline otherwise gcc + * inlines everything into a single function which results in too much + * register pressure. + */ +static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + unsigned long mask, result; + pte_t *ptep; + + result = _PAGE_PRESENT|_PAGE_USER; + if (write) + result |= _PAGE_RW; + mask = result | _PAGE_SPECIAL; + + ptep = pte_offset_kernel(&pmd, addr); + do { + pte_t pte = ACCESS_ONCE(*ptep); + struct page *page; + /* + * Similar to the PMD case, NUMA hinting must take slow path + */ + if (pte_numa(pte)) + return 0; + + if ((pte_val(pte) & mask) != result) + return 0; + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + page = pte_page(pte); + if (!page_cache_get_speculative(page)) + return 0; + if (unlikely(pte_val(pte) != pte_val(*ptep))) { + put_page(page); + return 0; + } + pages[*nr] = page; + (*nr)++; + + } while (ptep++, addr += PAGE_SIZE, addr != end); + + return 1; +} + +static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long next; + pmd_t *pmdp; + + pmdp = pmd_offset(&pud, addr); + do { + pmd_t pmd = ACCESS_ONCE(*pmdp); + + next = pmd_addr_end(addr, end); + /* + * If we find a splitting transparent hugepage we + * return zero. That will result in taking the slow + * path which will call wait_split_huge_page() + * if the pmd is still in splitting state + */ + if (pmd_none(pmd) || pmd_trans_splitting(pmd)) + return 0; + if (pmd_huge(pmd) || pmd_large(pmd)) { + /* + * NUMA hinting faults need to be handled in the GUP + * slowpath for accounting purposes and so that they + * can be serialised against THP migration. + */ + if (pmd_numa(pmd)) + return 0; + + if (!gup_hugepte((pte_t *)pmdp, PMD_SIZE, addr, next, + write, pages, nr)) + return 0; + } else if (is_hugepd(pmdp)) { + if (!gup_hugepd((hugepd_t *)pmdp, PMD_SHIFT, + addr, next, write, pages, nr)) + return 0; + } else if (!gup_pte_range(pmd, addr, next, write, pages, nr)) + return 0; + } while (pmdp++, addr = next, addr != end); + + return 1; +} + +static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long next; + pud_t *pudp; + + pudp = pud_offset(&pgd, addr); + do { + pud_t pud = ACCESS_ONCE(*pudp); + + next = pud_addr_end(addr, end); + if (pud_none(pud)) + return 0; + if (pud_huge(pud)) { + if (!gup_hugepte((pte_t *)pudp, PUD_SIZE, addr, next, + write, pages, nr)) + return 0; + } else if (is_hugepd(pudp)) { + if (!gup_hugepd((hugepd_t *)pudp, PUD_SHIFT, + addr, next, write, pages, nr)) + return 0; + } else if (!gup_pmd_range(pud, addr, next, write, pages, nr)) + return 0; + } while (pudp++, addr = next, addr != end); + + return 1; +} + +int __get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages) +{ + struct mm_struct *mm = current->mm; + unsigned long addr, len, end; + unsigned long next; + unsigned long flags; + pgd_t *pgdp; + int nr = 0; + + pr_devel("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read"); + + start &= PAGE_MASK; + addr = start; + len = (unsigned long) nr_pages << PAGE_SHIFT; + end = start + len; + + if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, + start, len))) + return 0; + + pr_devel(" aligned: %lx .. %lx\n", start, end); + + /* + * XXX: batch / limit 'nr', to avoid large irq off latency + * needs some instrumenting to determine the common sizes used by + * important workloads (eg. DB2), and whether limiting the batch size + * will decrease performance. + * + * It seems like we're in the clear for the moment. Direct-IO is + * the main guy that batches up lots of get_user_pages, and even + * they are limited to 64-at-a-time which is not so many. + */ + /* + * This doesn't prevent pagetable teardown, but does prevent + * the pagetables from being freed on powerpc. + * + * So long as we atomically load page table pointers versus teardown, + * we can follow the address down to the the page and take a ref on it. + */ + local_irq_save(flags); + + pgdp = pgd_offset(mm, addr); + do { + pgd_t pgd = ACCESS_ONCE(*pgdp); + + pr_devel(" %016lx: normal pgd %p\n", addr, + (void *)pgd_val(pgd)); + next = pgd_addr_end(addr, end); + if (pgd_none(pgd)) + break; + if (pgd_huge(pgd)) { + if (!gup_hugepte((pte_t *)pgdp, PGDIR_SIZE, addr, next, + write, pages, &nr)) + break; + } else if (is_hugepd(pgdp)) { + if (!gup_hugepd((hugepd_t *)pgdp, PGDIR_SHIFT, + addr, next, write, pages, &nr)) + break; + } else if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + break; + } while (pgdp++, addr = next, addr != end); + + local_irq_restore(flags); + + return nr; +} + +int get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages) +{ + struct mm_struct *mm = current->mm; + int nr, ret; + + start &= PAGE_MASK; + nr = __get_user_pages_fast(start, nr_pages, write, pages); + ret = nr; + + if (nr < nr_pages) { + pr_devel(" slow path ! nr = %d\n", nr); + + /* Try to get the remaining pages with get_user_pages */ + start += nr << PAGE_SHIFT; + pages += nr; + + down_read(&mm->mmap_sem); + ret = get_user_pages(current, mm, start, + nr_pages - nr, write, 0, pages, NULL); + up_read(&mm->mmap_sem); + + /* Have to be a bit careful with return values */ + if (nr > 0) { + if (ret < 0) + ret = nr; + else + ret += nr; + } + } + + return ret; +} + +#endif /* __HAVE_ARCH_PTE_SPECIAL */ diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S index e10d76a860d..115347f74ce 100644 --- a/arch/powerpc/mm/hash_low_32.S +++ b/arch/powerpc/mm/hash_low_32.S @@ -1,6 +1,4 @@ /* - * $Id: hashtable.S,v 1.6 1999/10/08 01:56:15 paulus Exp $ - * * PowerPC version * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) * Rewritten by Cort Dougan (cort@cs.nmt.edu) for PReP @@ -38,46 +36,16 @@ mmu_hash_lock: #endif /* CONFIG_SMP */ /* - * Sync CPUs with hash_page taking & releasing the hash - * table lock - */ -#ifdef CONFIG_SMP - .text -_GLOBAL(hash_page_sync) - mfmsr r10 - rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */ - mtmsr r0 - lis r8,mmu_hash_lock@h - ori r8,r8,mmu_hash_lock@l - lis r0,0x0fff - b 10f -11: lwz r6,0(r8) - cmpwi 0,r6,0 - bne 11b -10: lwarx r6,0,r8 - cmpwi 0,r6,0 - bne- 11b - stwcx. r0,0,r8 - bne- 10b - isync - eieio - li r0,0 - stw r0,0(r8) - mtmsr r10 - blr -#endif /* CONFIG_SMP */ - -/* * Load a PTE into the hash table, if possible. * The address is in r4, and r3 contains an access flag: * _PAGE_RW (0x400) if a write. * r9 contains the SRR1 value, from which we use the MSR_PR bit. - * SPRG3 contains the physical address of the current task's thread. + * SPRG_THREAD contains the physical address of the current task's thread. * * Returns to the caller if the access is illegal or there is no * mapping for the address. Otherwise it places an appropriate PTE * in the hash table and returns from the exception. - * Uses r0, r3 - r8, ctr, lr. + * Uses r0, r3 - r8, r10, ctr, lr. */ .text _GLOBAL(hash_page) @@ -100,7 +68,7 @@ _GLOBAL(hash_page) /* Get PTE (linux-style) and check access */ lis r0,KERNELBASE@h /* check if kernel address */ cmplw 0,r4,r0 - mfspr r8,SPRN_SPRG3 /* current task's THREAD (phys) */ + mfspr r8,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ ori r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */ lwz r5,PGDIR(r8) /* virt page-table root */ blt+ 112f /* assume user more likely */ @@ -108,9 +76,15 @@ _GLOBAL(hash_page) addi r5,r5,swapper_pg_dir@l /* kernel page table */ rlwimi r3,r9,32-12,29,29 /* MSR_PR -> _PAGE_USER */ 112: add r5,r5,r7 /* convert to phys addr */ +#ifndef CONFIG_PTE_64BIT rlwimi r5,r4,12,20,29 /* insert top 10 bits of address */ lwz r8,0(r5) /* get pmd entry */ rlwinm. r8,r8,0,0,19 /* extract address of pte page */ +#else + rlwinm r8,r4,13,19,29 /* Compute pgdir/pmd offset */ + lwzx r8,r8,r5 /* Get L1 entry */ + rlwinm. r8,r8,0,0,20 /* extract pt base address */ +#endif #ifdef CONFIG_SMP beq- hash_page_out /* return if no mapping */ #else @@ -120,7 +94,11 @@ _GLOBAL(hash_page) to the address following the rfi. */ beqlr- #endif +#ifndef CONFIG_PTE_64BIT rlwimi r8,r4,22,20,29 /* insert next 10 bits of address */ +#else + rlwimi r8,r4,23,20,28 /* compute pte address */ +#endif rlwinm r0,r3,32-3,24,24 /* _PAGE_RW access -> _PAGE_DIRTY */ ori r0,r0,_PAGE_ACCESSED|_PAGE_HASHPTE @@ -129,9 +107,15 @@ _GLOBAL(hash_page) * because almost always, there won't be a permission violation * and there won't already be an HPTE, and thus we will have * to update the PTE to set _PAGE_HASHPTE. -- paulus. + * + * If PTE_64BIT is set, the low word is the flags word; use that + * word for locking since it contains all the interesting bits. */ +#if (PTE_FLAGS_OFFSET != 0) + addi r8,r8,PTE_FLAGS_OFFSET +#endif retry: - lwarx r6,0,r8 /* get linux-style pte */ + lwarx r6,0,r8 /* get linux-style pte, flag word */ andc. r5,r3,r6 /* check access & ~permission */ #ifdef CONFIG_SMP bne- hash_page_out /* return if access not permitted */ @@ -139,6 +123,15 @@ retry: bnelr- #endif or r5,r0,r6 /* set accessed/dirty bits */ +#ifdef CONFIG_PTE_64BIT +#ifdef CONFIG_SMP + subf r10,r6,r8 /* create false data dependency */ + subi r10,r10,PTE_FLAGS_OFFSET + lwzx r10,r6,r10 /* Get upper PTE word */ +#else + lwz r10,-PTE_FLAGS_OFFSET(r8) +#endif /* CONFIG_SMP */ +#endif /* CONFIG_PTE_64BIT */ stwcx. r5,0,r8 /* attempt to update PTE */ bne- retry /* retry if someone got there first */ @@ -191,7 +184,7 @@ _GLOBAL(add_hash_page) add r3,r3,r0 /* note create_hpte trims to 24 bits */ #ifdef CONFIG_SMP - rlwinm r8,r1,0,0,18 /* use cpu number to make tag */ + CURRENT_THREAD_INFO(r8, r1) /* use cpu number to make tag */ lwz r8,TI_CPU(r8) /* to go in mmu_hash_lock */ oris r8,r8,12 #endif /* CONFIG_SMP */ @@ -205,9 +198,9 @@ _GLOBAL(add_hash_page) * we can't take a hash table miss (assuming the code is * covered by a BAT). -- paulus */ - mfmsr r10 + mfmsr r9 SYNC - rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */ + rlwinm r0,r9,0,17,15 /* clear bit 16 (MSR_EE) */ rlwinm r0,r0,0,28,26 /* clear MSR_DR */ mtmsr r0 SYNC_601 @@ -216,14 +209,14 @@ _GLOBAL(add_hash_page) tophys(r7,0) #ifdef CONFIG_SMP - addis r9,r7,mmu_hash_lock@ha - addi r9,r9,mmu_hash_lock@l -10: lwarx r0,0,r9 /* take the mmu_hash_lock */ + addis r6,r7,mmu_hash_lock@ha + addi r6,r6,mmu_hash_lock@l +10: lwarx r0,0,r6 /* take the mmu_hash_lock */ cmpi 0,r0,0 bne- 11f - stwcx. r8,0,r9 + stwcx. r8,0,r6 beq+ 12f -11: lwz r0,0(r9) +11: lwz r0,0(r6) cmpi 0,r0,0 beq 10b b 11b @@ -236,10 +229,24 @@ _GLOBAL(add_hash_page) * HPTE, so we just unlock and return. */ mr r8,r5 +#ifndef CONFIG_PTE_64BIT rlwimi r8,r4,22,20,29 +#else + rlwimi r8,r4,23,20,28 + addi r8,r8,PTE_FLAGS_OFFSET +#endif 1: lwarx r6,0,r8 andi. r0,r6,_PAGE_HASHPTE bne 9f /* if HASHPTE already set, done */ +#ifdef CONFIG_PTE_64BIT +#ifdef CONFIG_SMP + subf r10,r6,r8 /* create false data dependency */ + subi r10,r10,PTE_FLAGS_OFFSET + lwzx r10,r6,r10 /* Get upper PTE word */ +#else + lwz r10,-PTE_FLAGS_OFFSET(r8) +#endif /* CONFIG_SMP */ +#endif /* CONFIG_PTE_64BIT */ ori r5,r6,_PAGE_HASHPTE stwcx. r5,0,r8 bne- 1b @@ -248,13 +255,15 @@ _GLOBAL(add_hash_page) 9: #ifdef CONFIG_SMP + addis r6,r7,mmu_hash_lock@ha + addi r6,r6,mmu_hash_lock@l eieio li r0,0 - stw r0,0(r9) /* clear mmu_hash_lock */ + stw r0,0(r6) /* clear mmu_hash_lock */ #endif /* reenable interrupts and DR */ - mtmsr r10 + mtmsr r9 SYNC_601 isync @@ -269,7 +278,8 @@ _GLOBAL(add_hash_page) * r5 contains the linux PTE, r6 contains the old value of the * linux PTE (before setting _PAGE_HASHPTE) and r7 contains the * offset to be added to addresses (0 if the MMU is on, - * -KERNELBASE if it is off). + * -KERNELBASE if it is off). r10 contains the upper half of + * the PTE if CONFIG_PTE_64BIT. * On SMP, the caller should have the mmu_hash_lock held. * We assume that the caller has (or will) set the _PAGE_HASHPTE * bit in the linux PTE in memory. The value passed in r6 should @@ -287,7 +297,7 @@ Hash_bits = 12 /* e.g. 256kB hash table */ Hash_msk = (((1 << Hash_bits) - 1) * 64) /* defines for the PTE format for 32-bit PPCs */ -#define PTE_SIZE 8 +#define HPTE_SIZE 8 #define PTEG_SIZE 64 #define LG_PTEG_SIZE 6 #define LDPTEu lwzu @@ -310,11 +320,16 @@ _GLOBAL(create_hpte) and r8,r8,r0 /* writable if _RW & _DIRTY */ rlwimi r5,r5,32-1,30,30 /* _PAGE_USER -> PP msb */ rlwimi r5,r5,32-2,31,31 /* _PAGE_USER -> PP lsb */ - ori r8,r8,0xe14 /* clear out reserved bits and M */ + ori r8,r8,0xe04 /* clear out reserved bits */ andc r8,r5,r8 /* PP = user? (rw&dirty? 2: 3): 0 */ BEGIN_FTR_SECTION - ori r8,r8,_PAGE_COHERENT /* set M (coherence required) */ -END_FTR_SECTION_IFSET(CPU_FTR_NEED_COHERENT) + rlwinm r8,r8,0,~_PAGE_COHERENT /* clear M (coherence not required) */ +END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) +#ifdef CONFIG_PTE_64BIT + /* Put the XPN bits into the PTE */ + rlwimi r8,r10,8,20,22 + rlwimi r8,r10,2,29,29 +#endif /* Construct the high word of the PPC-style PTE (r5) */ rlwinm r5,r3,7,1,24 /* put VSID in 0x7fffff80 bits */ @@ -344,8 +359,8 @@ _GLOBAL(hash_page_patch_A) /* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */ mtctr r0 - addi r4,r3,-PTE_SIZE -1: LDPTEu r6,PTE_SIZE(r4) /* get next PTE */ + addi r4,r3,-HPTE_SIZE +1: LDPTEu r6,HPTE_SIZE(r4) /* get next PTE */ CMPPTE 0,r6,r5 bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */ beq+ found_slot @@ -355,9 +370,9 @@ _GLOBAL(hash_page_patch_A) _GLOBAL(hash_page_patch_B) xoris r4,r3,Hash_msk>>16 /* compute secondary hash */ xori r4,r4,(-PTEG_SIZE & 0xffff) - addi r4,r4,-PTE_SIZE + addi r4,r4,-HPTE_SIZE mtctr r0 -2: LDPTEu r6,PTE_SIZE(r4) +2: LDPTEu r6,HPTE_SIZE(r4) CMPPTE 0,r6,r5 bdnzf 2,2b beq+ found_slot @@ -365,8 +380,8 @@ _GLOBAL(hash_page_patch_B) /* Search the primary PTEG for an empty slot */ 10: mtctr r0 - addi r4,r3,-PTE_SIZE /* search primary PTEG */ -1: LDPTEu r6,PTE_SIZE(r4) /* get next PTE */ + addi r4,r3,-HPTE_SIZE /* search primary PTEG */ +1: LDPTEu r6,HPTE_SIZE(r4) /* get next PTE */ TST_V(r6) /* test valid bit */ bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */ beq+ found_empty @@ -382,9 +397,9 @@ _GLOBAL(hash_page_patch_B) _GLOBAL(hash_page_patch_C) xoris r4,r3,Hash_msk>>16 /* compute secondary hash */ xori r4,r4,(-PTEG_SIZE & 0xffff) - addi r4,r4,-PTE_SIZE + addi r4,r4,-HPTE_SIZE mtctr r0 -2: LDPTEu r6,PTE_SIZE(r4) +2: LDPTEu r6,HPTE_SIZE(r4) TST_V(r6) bdnzf 2,2b beq+ found_empty @@ -411,11 +426,11 @@ _GLOBAL(hash_page_patch_C) 1: addis r4,r7,next_slot@ha /* get next evict slot */ lwz r6,next_slot@l(r4) - addi r6,r6,PTE_SIZE /* search for candidate */ - andi. r6,r6,7*PTE_SIZE + addi r6,r6,HPTE_SIZE /* search for candidate */ + andi. r6,r6,7*HPTE_SIZE stw r6,next_slot@l(r4) add r4,r3,r6 - LDPTE r0,PTE_SIZE/2(r4) /* get PTE second word */ + LDPTE r0,HPTE_SIZE/2(r4) /* get PTE second word */ clrrwi r0,r0,12 lis r6,etext@h ori r6,r6,etext@l /* get etext */ @@ -428,7 +443,7 @@ _GLOBAL(hash_page_patch_C) found_empty: STPTE r5,0(r4) found_slot: - STPTE r8,PTE_SIZE/2(r4) + STPTE r8,HPTE_SIZE/2(r4) #else /* CONFIG_SMP */ /* @@ -454,7 +469,7 @@ found_slot: STPTE r5,0(r4) sync TLBSYNC - STPTE r8,PTE_SIZE/2(r4) /* put in correct RPN, WIMG, PP bits */ + STPTE r8,HPTE_SIZE/2(r4) /* put in correct RPN, WIMG, PP bits */ sync SET_V(r5) STPTE r5,0(r4) /* finally set V bit in PTE */ @@ -501,14 +516,18 @@ _GLOBAL(flush_hash_pages) isync /* First find a PTE in the range that has _PAGE_HASHPTE set */ +#ifndef CONFIG_PTE_64BIT rlwimi r5,r4,22,20,29 -1: lwz r0,0(r5) +#else + rlwimi r5,r4,23,20,28 +#endif +1: lwz r0,PTE_FLAGS_OFFSET(r5) cmpwi cr1,r6,1 andi. r0,r0,_PAGE_HASHPTE bne 2f ble cr1,19f addi r4,r4,0x1000 - addi r5,r5,4 + addi r5,r5,PTE_SIZE addi r6,r6,-1 b 1b @@ -526,7 +545,7 @@ _GLOBAL(flush_hash_pages) #ifdef CONFIG_SMP addis r9,r7,mmu_hash_lock@ha addi r9,r9,mmu_hash_lock@l - rlwinm r8,r1,0,0,18 + CURRENT_THREAD_INFO(r8, r1) add r8,r8,r7 lwz r8,TI_CPU(r8) oris r8,r8,9 @@ -547,7 +566,10 @@ _GLOBAL(flush_hash_pages) * already clear, we're done (for this pte). If not, * clear it (atomically) and proceed. -- paulus. */ -33: lwarx r8,0,r5 /* fetch the pte */ +#if (PTE_FLAGS_OFFSET != 0) + addi r5,r5,PTE_FLAGS_OFFSET +#endif +33: lwarx r8,0,r5 /* fetch the pte flags word */ andi. r0,r8,_PAGE_HASHPTE beq 8f /* done if HASHPTE is already clear */ rlwinm r8,r8,0,31,29 /* clear HASHPTE bit */ @@ -564,8 +586,8 @@ _GLOBAL(flush_hash_patch_A) /* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */ li r0,8 /* PTEs/group */ mtctr r0 - addi r12,r8,-PTE_SIZE -1: LDPTEu r0,PTE_SIZE(r12) /* get next PTE */ + addi r12,r8,-HPTE_SIZE +1: LDPTEu r0,HPTE_SIZE(r12) /* get next PTE */ CMPPTE 0,r0,r11 bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */ beq+ 3f @@ -576,9 +598,9 @@ _GLOBAL(flush_hash_patch_A) _GLOBAL(flush_hash_patch_B) xoris r12,r8,Hash_msk>>16 /* compute secondary hash */ xori r12,r12,(-PTEG_SIZE & 0xffff) - addi r12,r12,-PTE_SIZE + addi r12,r12,-HPTE_SIZE mtctr r0 -2: LDPTEu r0,PTE_SIZE(r12) +2: LDPTEu r0,HPTE_SIZE(r12) CMPPTE 0,r0,r11 bdnzf 2,2b xori r11,r11,PTE_H /* clear H again */ @@ -592,7 +614,7 @@ _GLOBAL(flush_hash_patch_B) 8: ble cr1,9f /* if all ptes checked */ 81: addi r6,r6,-1 - addi r5,r5,4 /* advance to next pte */ + addi r5,r5,PTE_SIZE addi r4,r4,0x1000 lwz r0,0(r5) /* check next pte */ cmpwi cr1,r6,1 @@ -611,3 +633,80 @@ _GLOBAL(flush_hash_patch_B) SYNC_601 isync blr + +/* + * Flush an entry from the TLB + */ +_GLOBAL(_tlbie) +#ifdef CONFIG_SMP + CURRENT_THREAD_INFO(r8, r1) + lwz r8,TI_CPU(r8) + oris r8,r8,11 + mfmsr r10 + SYNC + rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */ + rlwinm r0,r0,0,28,26 /* clear DR */ + mtmsr r0 + SYNC_601 + isync + lis r9,mmu_hash_lock@h + ori r9,r9,mmu_hash_lock@l + tophys(r9,r9) +10: lwarx r7,0,r9 + cmpwi 0,r7,0 + bne- 10b + stwcx. r8,0,r9 + bne- 10b + eieio + tlbie r3 + sync + TLBSYNC + li r0,0 + stw r0,0(r9) /* clear mmu_hash_lock */ + mtmsr r10 + SYNC_601 + isync +#else /* CONFIG_SMP */ + tlbie r3 + sync +#endif /* CONFIG_SMP */ + blr + +/* + * Flush the entire TLB. 603/603e only + */ +_GLOBAL(_tlbia) +#if defined(CONFIG_SMP) + CURRENT_THREAD_INFO(r8, r1) + lwz r8,TI_CPU(r8) + oris r8,r8,10 + mfmsr r10 + SYNC + rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */ + rlwinm r0,r0,0,28,26 /* clear DR */ + mtmsr r0 + SYNC_601 + isync + lis r9,mmu_hash_lock@h + ori r9,r9,mmu_hash_lock@l + tophys(r9,r9) +10: lwarx r7,0,r9 + cmpwi 0,r7,0 + bne- 10b + stwcx. r8,0,r9 + bne- 10b + sync + tlbia + sync + TLBSYNC + li r0,0 + stw r0,0(r9) /* clear mmu_hash_lock */ + mtmsr r10 + SYNC_601 + isync +#else /* CONFIG_SMP */ + sync + tlbia + sync +#endif /* CONFIG_SMP */ + blr diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S index 21d24848647..057cbbb4c57 100644 --- a/arch/powerpc/mm/hash_low_64.S +++ b/arch/powerpc/mm/hash_low_64.S @@ -34,14 +34,6 @@ * | CR save area (SP + 8) * SP ---> +-- Back chain (SP + 0) */ -#define STACKFRAMESIZE 256 - -/* Save parameters offsets */ -#define STK_PARM(i) (STACKFRAMESIZE + 48 + ((i)-3)*8) - -/* Save non-volatile offsets */ -#define STK_REG(i) (112 + ((i)-14)*8) - #ifndef CONFIG_PPC_64K_PAGES @@ -64,25 +56,22 @@ _GLOBAL(__hash_page_4K) std r0,16(r1) stdu r1,-STACKFRAMESIZE(r1) /* Save all params that we need after a function call */ - std r6,STK_PARM(r6)(r1) - std r8,STK_PARM(r8)(r1) - std r9,STK_PARM(r9)(r1) + std r6,STK_PARAM(R6)(r1) + std r8,STK_PARAM(R8)(r1) + std r9,STK_PARAM(R9)(r1) - /* Add _PAGE_PRESENT to access */ - ori r4,r4,_PAGE_PRESENT - /* Save non-volatile registers. * r31 will hold "old PTE" * r30 is "new PTE" - * r29 is "va" + * r29 is vpn * r28 is a hash value * r27 is hashtab mask (maybe dynamic patched instead ?) */ - std r27,STK_REG(r27)(r1) - std r28,STK_REG(r28)(r1) - std r29,STK_REG(r29)(r1) - std r30,STK_REG(r30)(r1) - std r31,STK_REG(r31)(r1) + std r27,STK_REG(R27)(r1) + std r28,STK_REG(R28)(r1) + std r29,STK_REG(R29)(r1) + std r30,STK_REG(R30)(r1) + std r31,STK_REG(R31)(r1) /* Step 1: * @@ -121,26 +110,34 @@ _GLOBAL(__hash_page_4K) BEGIN_FTR_SECTION cmpdi r9,0 /* check segment size */ bne 3f -END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT) - /* Calc va and put it in r29 */ - rldicr r29,r5,28,63-28 - rldicl r3,r3,0,36 - or r29,r3,r29 - - /* Calculate hash value for primary slot and store it in r28 */ - rldicl r5,r5,0,25 /* vsid & 0x0000007fffffffff */ - rldicl r0,r3,64-12,48 /* (ea >> 12) & 0xffff */ - xor r28,r5,r0 +END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) + /* Calc vpn and put it in r29 */ + sldi r29,r5,SID_SHIFT - VPN_SHIFT + rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT - VPN_SHIFT) + or r29,r28,r29 + /* + * Calculate hash value for primary slot and store it in r28 + * r3 = va, r5 = vsid + * r0 = (va >> 12) & ((1ul << (28 - 12)) -1) + */ + rldicl r0,r3,64-12,48 + xor r28,r5,r0 /* hash */ b 4f -3: /* Calc VA and hash in r29 and r28 for 1T segment */ - sldi r29,r5,40 /* vsid << 40 */ - clrldi r3,r3,24 /* ea & 0xffffffffff */ - rldic r28,r5,25,25 /* (vsid << 25) & 0x7fffffffff */ - clrldi r5,r5,40 /* vsid & 0xffffff */ - rldicl r0,r3,64-12,36 /* (ea >> 12) & 0xfffffff */ - xor r28,r28,r5 - or r29,r3,r29 /* VA */ +3: /* Calc vpn and put it in r29 */ + sldi r29,r5,SID_SHIFT_1T - VPN_SHIFT + rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT_1T - VPN_SHIFT) + or r29,r28,r29 + + /* + * calculate hash value for primary slot and + * store it in r28 for 1T segment + * r3 = va, r5 = vsid + */ + sldi r28,r5,25 /* vsid << 25 */ + /* r0 = (va >> 12) & ((1ul << (40 - 12)) -1) */ + rldicl r0,r3,64-12,36 + xor r28,r28,r5 /* vsid ^ ( vsid << 25) */ xor r28,r28,r0 /* hash */ /* Convert linux PTE bits into HW equivalents */ @@ -151,7 +148,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT) and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/ andc r0,r30,r0 /* r0 = pte & ~r0 */ rlwimi r3,r0,32-1,31,31 /* Insert result into PP lsb */ - ori r3,r3,HPTE_R_C /* Always add "C" bit for perf. */ + /* + * Always add "C" bit for perf. Memory coherence is always enabled + */ + ori r3,r3,HPTE_R_C | HPTE_R_M /* We eventually do the icache sync here (maybe inline that * code rather than call a C function...) @@ -159,13 +159,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT) BEGIN_FTR_SECTION mr r4,r30 mr r5,r7 - bl .hash_page_do_lazy_icache + bl hash_page_do_lazy_icache END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE) /* At this point, r3 contains new PP bits, save them in * place of "access" in the param area (sic) */ - std r3,STK_PARM(r4)(r1) + std r3,STK_PARAM(R4)(r1) /* Get htab_hash_mask */ ld r4,htab_hash_mask@got(2) @@ -195,12 +195,14 @@ htab_insert_pte: rldicr r3,r0,3,63-3 /* r3 = (hash & mask) << 3 */ /* Call ppc_md.hpte_insert */ - ld r6,STK_PARM(r4)(r1) /* Retreive new pp bits */ - mr r4,r29 /* Retreive va */ + ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */ + mr r4,r29 /* Retrieve vpn */ li r7,0 /* !bolted, !secondary */ li r8,MMU_PAGE_4K /* page size */ - ld r9,STK_PARM(r9)(r1) /* segment size */ -_GLOBAL(htab_call_hpte_insert1) + li r9,MMU_PAGE_4K /* actual page size */ + ld r10,STK_PARAM(R9)(r1) /* segment size */ +.globl htab_call_hpte_insert1 +htab_call_hpte_insert1: bl . /* Patched by htab_finish_init() */ cmpdi 0,r3,0 bge htab_pte_insert_ok /* Insertion successful */ @@ -218,12 +220,14 @@ _GLOBAL(htab_call_hpte_insert1) rldicr r3,r0,3,63-3 /* r0 = (~hash & mask) << 3 */ /* Call ppc_md.hpte_insert */ - ld r6,STK_PARM(r4)(r1) /* Retreive new pp bits */ - mr r4,r29 /* Retreive va */ + ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */ + mr r4,r29 /* Retrieve vpn */ li r7,HPTE_V_SECONDARY /* !bolted, secondary */ li r8,MMU_PAGE_4K /* page size */ - ld r9,STK_PARM(r9)(r1) /* segment size */ -_GLOBAL(htab_call_hpte_insert2) + li r9,MMU_PAGE_4K /* actual page size */ + ld r10,STK_PARAM(R9)(r1) /* segment size */ +.globl htab_call_hpte_insert2 +htab_call_hpte_insert2: bl . /* Patched by htab_finish_init() */ cmpdi 0,r3,0 bge+ htab_pte_insert_ok /* Insertion successful */ @@ -240,7 +244,8 @@ _GLOBAL(htab_call_hpte_insert2) 2: and r0,r5,r27 rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */ /* Call ppc_md.hpte_remove */ -_GLOBAL(htab_call_hpte_remove) +.globl htab_call_hpte_remove +htab_call_hpte_remove: bl . /* Patched by htab_finish_init() */ /* Try all again */ @@ -258,15 +263,15 @@ htab_pte_insert_ok: * (maybe add eieio may be good still ?) */ htab_write_out_pte: - ld r6,STK_PARM(r6)(r1) + ld r6,STK_PARAM(R6)(r1) std r30,0(r6) li r3, 0 htab_bail: - ld r27,STK_REG(r27)(r1) - ld r28,STK_REG(r28)(r1) - ld r29,STK_REG(r29)(r1) - ld r30,STK_REG(r30)(r1) - ld r31,STK_REG(r31)(r1) + ld r27,STK_REG(R27)(r1) + ld r28,STK_REG(R28)(r1) + ld r29,STK_REG(R29)(r1) + ld r30,STK_REG(R30)(r1) + ld r31,STK_REG(R31)(r1) addi r1,r1,STACKFRAMESIZE ld r0,16(r1) mtlr r0 @@ -289,11 +294,13 @@ htab_modify_pte: add r3,r0,r3 /* add slot idx */ /* Call ppc_md.hpte_updatepp */ - mr r5,r29 /* va */ - li r6,MMU_PAGE_4K /* page size */ - ld r7,STK_PARM(r9)(r1) /* segment size */ - ld r8,STK_PARM(r8)(r1) /* get "local" param */ -_GLOBAL(htab_call_hpte_updatepp) + mr r5,r29 /* vpn */ + li r6,MMU_PAGE_4K /* base page size */ + li r7,MMU_PAGE_4K /* actual page size */ + ld r8,STK_PARAM(R9)(r1) /* segment size */ + ld r9,STK_PARAM(R8)(r1) /* get "local" param */ +.globl htab_call_hpte_updatepp +htab_call_hpte_updatepp: bl . /* Patched by htab_finish_init() */ /* if we failed because typically the HPTE wasn't really here @@ -315,7 +322,7 @@ htab_wrong_access: htab_pte_insert_failure: /* Bail out restoring old PTE */ - ld r6,STK_PARM(r6)(r1) + ld r6,STK_PARAM(R6)(r1) std r31,0(r6) li r3,-1 b htab_bail @@ -343,29 +350,26 @@ _GLOBAL(__hash_page_4K) std r0,16(r1) stdu r1,-STACKFRAMESIZE(r1) /* Save all params that we need after a function call */ - std r6,STK_PARM(r6)(r1) - std r8,STK_PARM(r8)(r1) - std r9,STK_PARM(r9)(r1) - - /* Add _PAGE_PRESENT to access */ - ori r4,r4,_PAGE_PRESENT + std r6,STK_PARAM(R6)(r1) + std r8,STK_PARAM(R8)(r1) + std r9,STK_PARAM(R9)(r1) /* Save non-volatile registers. * r31 will hold "old PTE" * r30 is "new PTE" - * r29 is "va" + * r29 is vpn * r28 is a hash value * r27 is hashtab mask (maybe dynamic patched instead ?) * r26 is the hidx mask * r25 is the index in combo page */ - std r25,STK_REG(r25)(r1) - std r26,STK_REG(r26)(r1) - std r27,STK_REG(r27)(r1) - std r28,STK_REG(r28)(r1) - std r29,STK_REG(r29)(r1) - std r30,STK_REG(r30)(r1) - std r31,STK_REG(r31)(r1) + std r25,STK_REG(R25)(r1) + std r26,STK_REG(R26)(r1) + std r27,STK_REG(R27)(r1) + std r28,STK_REG(R28)(r1) + std r29,STK_REG(R29)(r1) + std r30,STK_REG(R30)(r1) + std r31,STK_REG(R31)(r1) /* Step 1: * @@ -388,7 +392,7 @@ _GLOBAL(__hash_page_4K) */ rlwinm r30,r4,32-9+7,31-7,31-7 /* _PAGE_RW -> _PAGE_DIRTY */ or r30,r30,r31 - ori r30,r30,_PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE + ori r30,r30,_PAGE_BUSY | _PAGE_ACCESSED oris r30,r30,_PAGE_COMBO@h /* Write the linux PTE atomically (setting busy) */ stdcx. r30,0,r6 @@ -407,26 +411,42 @@ _GLOBAL(__hash_page_4K) BEGIN_FTR_SECTION cmpdi r9,0 /* check segment size */ bne 3f -END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT) - /* Calc va and put it in r29 */ - rldicr r29,r5,28,63-28 /* r29 = (vsid << 28) */ - rldicl r3,r3,0,36 /* r3 = (ea & 0x0fffffff) */ - or r29,r3,r29 /* r29 = va */ - - /* Calculate hash value for primary slot and store it in r28 */ - rldicl r5,r5,0,25 /* vsid & 0x0000007fffffffff */ - rldicl r0,r3,64-12,48 /* (ea >> 12) & 0xffff */ - xor r28,r5,r0 +END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) + /* Calc vpn and put it in r29 */ + sldi r29,r5,SID_SHIFT - VPN_SHIFT + /* + * clrldi r3,r3,64 - SID_SHIFT --> ea & 0xfffffff + * srdi r28,r3,VPN_SHIFT + */ + rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT - VPN_SHIFT) + or r29,r28,r29 + /* + * Calculate hash value for primary slot and store it in r28 + * r3 = va, r5 = vsid + * r0 = (va >> 12) & ((1ul << (28 - 12)) -1) + */ + rldicl r0,r3,64-12,48 + xor r28,r5,r0 /* hash */ b 4f -3: /* Calc VA and hash in r29 and r28 for 1T segment */ - sldi r29,r5,40 /* vsid << 40 */ - clrldi r3,r3,24 /* ea & 0xffffffffff */ - rldic r28,r5,25,25 /* (vsid << 25) & 0x7fffffffff */ - clrldi r5,r5,40 /* vsid & 0xffffff */ - rldicl r0,r3,64-12,36 /* (ea >> 12) & 0xfffffff */ - xor r28,r28,r5 - or r29,r3,r29 /* VA */ +3: /* Calc vpn and put it in r29 */ + sldi r29,r5,SID_SHIFT_1T - VPN_SHIFT + /* + * clrldi r3,r3,64 - SID_SHIFT_1T --> ea & 0xffffffffff + * srdi r28,r3,VPN_SHIFT + */ + rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT_1T - VPN_SHIFT) + or r29,r28,r29 + + /* + * Calculate hash value for primary slot and + * store it in r28 for 1T segment + * r3 = va, r5 = vsid + */ + sldi r28,r5,25 /* vsid << 25 */ + /* r0 = (va >> 12) & ((1ul << (40 - 12)) -1) */ + rldicl r0,r3,64-12,36 + xor r28,r28,r5 /* vsid ^ ( vsid << 25) */ xor r28,r28,r0 /* hash */ /* Convert linux PTE bits into HW equivalents */ @@ -444,7 +464,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT) and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/ andc r0,r3,r0 /* r0 = pte & ~r0 */ rlwimi r3,r0,32-1,31,31 /* Insert result into PP lsb */ - ori r3,r3,HPTE_R_C /* Always add "C" bit for perf. */ + /* + * Always add "C" bit for perf. Memory coherence is always enabled + */ + ori r3,r3,HPTE_R_C | HPTE_R_M /* We eventually do the icache sync here (maybe inline that * code rather than call a C function...) @@ -452,13 +475,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT) BEGIN_FTR_SECTION mr r4,r30 mr r5,r7 - bl .hash_page_do_lazy_icache + bl hash_page_do_lazy_icache END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE) /* At this point, r3 contains new PP bits, save them in * place of "access" in the param area (sic) */ - std r3,STK_PARM(r4)(r1) + std r3,STK_PARAM(R4)(r1) /* Get htab_hash_mask */ ld r4,htab_hash_mask@got(2) @@ -468,7 +491,7 @@ END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE) * go to out-of-line code to try to modify the HPTE. We look for * the bit at (1 >> (index + 32)) */ - andi. r0,r31,_PAGE_HASHPTE + rldicl. r0,r31,64-12,48 li r26,0 /* Default hidx */ beq htab_insert_pte @@ -479,8 +502,8 @@ END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE) andis. r0,r31,_PAGE_COMBO@h beq htab_inval_old_hpte - ld r6,STK_PARM(r6)(r1) - ori r26,r6,0x8000 /* Load the hidx mask */ + ld r6,STK_PARAM(R6)(r1) + ori r26,r6,PTE_PAGE_HIDX_OFFSET /* Load the hidx mask. */ ld r26,0(r26) addi r5,r25,36 /* Check actual HPTE_SUB bit, this */ rldcr. r0,r31,r5,0 /* must match pgtable.h definition */ @@ -501,12 +524,14 @@ htab_special_pfn: rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */ /* Call ppc_md.hpte_insert */ - ld r6,STK_PARM(r4)(r1) /* Retreive new pp bits */ - mr r4,r29 /* Retreive va */ + ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */ + mr r4,r29 /* Retrieve vpn */ li r7,0 /* !bolted, !secondary */ li r8,MMU_PAGE_4K /* page size */ - ld r9,STK_PARM(r9)(r1) /* segment size */ -_GLOBAL(htab_call_hpte_insert1) + li r9,MMU_PAGE_4K /* actual page size */ + ld r10,STK_PARAM(R9)(r1) /* segment size */ +.globl htab_call_hpte_insert1 +htab_call_hpte_insert1: bl . /* patched by htab_finish_init() */ cmpdi 0,r3,0 bge htab_pte_insert_ok /* Insertion successful */ @@ -528,12 +553,14 @@ _GLOBAL(htab_call_hpte_insert1) rldicr r3,r0,3,63-3 /* r0 = (~hash & mask) << 3 */ /* Call ppc_md.hpte_insert */ - ld r6,STK_PARM(r4)(r1) /* Retreive new pp bits */ - mr r4,r29 /* Retreive va */ + ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */ + mr r4,r29 /* Retrieve vpn */ li r7,HPTE_V_SECONDARY /* !bolted, secondary */ li r8,MMU_PAGE_4K /* page size */ - ld r9,STK_PARM(r9)(r1) /* segment size */ -_GLOBAL(htab_call_hpte_insert2) + li r9,MMU_PAGE_4K /* actual page size */ + ld r10,STK_PARAM(R9)(r1) /* segment size */ +.globl htab_call_hpte_insert2 +htab_call_hpte_insert2: bl . /* patched by htab_finish_init() */ cmpdi 0,r3,0 bge+ htab_pte_insert_ok /* Insertion successful */ @@ -550,7 +577,8 @@ _GLOBAL(htab_call_hpte_insert2) 2: and r0,r5,r27 rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */ /* Call ppc_md.hpte_remove */ -_GLOBAL(htab_call_hpte_remove) +.globl htab_call_hpte_remove +htab_call_hpte_remove: bl . /* patched by htab_finish_init() */ /* Try all again */ @@ -561,13 +589,17 @@ _GLOBAL(htab_call_hpte_remove) * useless now that the segment has been switched to 4k pages. */ htab_inval_old_hpte: - mr r3,r29 /* virtual addr */ + mr r3,r29 /* vpn */ mr r4,r31 /* PTE.pte */ li r5,0 /* PTE.hidx */ li r6,MMU_PAGE_64K /* psize */ - ld r7,STK_PARM(r9)(r1) /* ssize */ - ld r8,STK_PARM(r8)(r1) /* local */ - bl .flush_hash_page + ld r7,STK_PARAM(R9)(r1) /* ssize */ + ld r8,STK_PARAM(R8)(r1) /* local */ + bl flush_hash_page + /* Clear out _PAGE_HPTE_SUB bits in the new linux PTE */ + lis r0,_PAGE_HPTE_SUB@h + ori r0,r0,_PAGE_HPTE_SUB@l + andc r30,r30,r0 b htab_insert_pte htab_bail_ok: @@ -578,7 +610,7 @@ htab_pte_insert_ok: /* Insert slot number & secondary bit in PTE second half, * clear _PAGE_BUSY and set approriate HPTE slot bit */ - ld r6,STK_PARM(r6)(r1) + ld r6,STK_PARAM(R6)(r1) li r0,_PAGE_BUSY andc r30,r30,r0 /* HPTE SUB bit */ @@ -593,19 +625,19 @@ htab_pte_insert_ok: sld r4,r4,r5 andc r26,r26,r4 or r26,r26,r3 - ori r5,r6,0x8000 + ori r5,r6,PTE_PAGE_HIDX_OFFSET std r26,0(r5) lwsync std r30,0(r6) li r3, 0 htab_bail: - ld r25,STK_REG(r25)(r1) - ld r26,STK_REG(r26)(r1) - ld r27,STK_REG(r27)(r1) - ld r28,STK_REG(r28)(r1) - ld r29,STK_REG(r29)(r1) - ld r30,STK_REG(r30)(r1) - ld r31,STK_REG(r31)(r1) + ld r25,STK_REG(R25)(r1) + ld r26,STK_REG(R26)(r1) + ld r27,STK_REG(R27)(r1) + ld r28,STK_REG(R28)(r1) + ld r29,STK_REG(R29)(r1) + ld r30,STK_REG(R30)(r1) + ld r31,STK_REG(R31)(r1) addi r1,r1,STACKFRAMESIZE ld r0,16(r1) mtlr r0 @@ -630,11 +662,13 @@ htab_modify_pte: add r3,r0,r3 /* add slot idx */ /* Call ppc_md.hpte_updatepp */ - mr r5,r29 /* va */ - li r6,MMU_PAGE_4K /* page size */ - ld r7,STK_PARM(r9)(r1) /* segment size */ - ld r8,STK_PARM(r8)(r1) /* get "local" param */ -_GLOBAL(htab_call_hpte_updatepp) + mr r5,r29 /* vpn */ + li r6,MMU_PAGE_4K /* base page size */ + li r7,MMU_PAGE_4K /* actual page size */ + ld r8,STK_PARAM(R9)(r1) /* segment size */ + ld r9,STK_PARAM(R8)(r1) /* get "local" param */ +.globl htab_call_hpte_updatepp +htab_call_hpte_updatepp: bl . /* patched by htab_finish_init() */ /* if we failed because typically the HPTE wasn't really here @@ -646,7 +680,7 @@ _GLOBAL(htab_call_hpte_updatepp) /* Clear the BUSY bit and Write out the PTE */ li r0,_PAGE_BUSY andc r30,r30,r0 - ld r6,STK_PARM(r6)(r1) + ld r6,STK_PARAM(R6)(r1) std r30,0(r6) li r3,0 b htab_bail @@ -659,7 +693,7 @@ htab_wrong_access: htab_pte_insert_failure: /* Bail out restoring old PTE */ - ld r6,STK_PARM(r6)(r1) + ld r6,STK_PARAM(R6)(r1) std r31,0(r6) li r3,-1 b htab_bail @@ -679,25 +713,22 @@ _GLOBAL(__hash_page_64K) std r0,16(r1) stdu r1,-STACKFRAMESIZE(r1) /* Save all params that we need after a function call */ - std r6,STK_PARM(r6)(r1) - std r8,STK_PARM(r8)(r1) - std r9,STK_PARM(r9)(r1) - - /* Add _PAGE_PRESENT to access */ - ori r4,r4,_PAGE_PRESENT + std r6,STK_PARAM(R6)(r1) + std r8,STK_PARAM(R8)(r1) + std r9,STK_PARAM(R9)(r1) /* Save non-volatile registers. * r31 will hold "old PTE" * r30 is "new PTE" - * r29 is "va" + * r29 is vpn * r28 is a hash value * r27 is hashtab mask (maybe dynamic patched instead ?) */ - std r27,STK_REG(r27)(r1) - std r28,STK_REG(r28)(r1) - std r29,STK_REG(r29)(r1) - std r30,STK_REG(r30)(r1) - std r31,STK_REG(r31)(r1) + std r27,STK_REG(R27)(r1) + std r28,STK_REG(R28)(r1) + std r29,STK_REG(R29)(r1) + std r30,STK_REG(R30)(r1) + std r31,STK_REG(R31)(r1) /* Step 1: * @@ -720,13 +751,13 @@ BEGIN_FTR_SECTION andi. r0,r31,_PAGE_NO_CACHE /* If so, bail out and refault as a 4k page */ bne- ht64_bail_ok -END_FTR_SECTION_IFCLR(CPU_FTR_CI_LARGE_PAGE) +END_MMU_FTR_SECTION_IFCLR(MMU_FTR_CI_LARGE_PAGE) /* Prepare new PTE value (turn access RW into DIRTY, then - * add BUSY,HASHPTE and ACCESSED) + * add BUSY and ACCESSED) */ rlwinm r30,r4,32-9+7,31-7,31-7 /* _PAGE_RW -> _PAGE_DIRTY */ or r30,r30,r31 - ori r30,r30,_PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE + ori r30,r30,_PAGE_BUSY | _PAGE_ACCESSED /* Write the linux PTE atomically (setting busy) */ stdcx. r30,0,r6 bne- 1b @@ -741,26 +772,33 @@ END_FTR_SECTION_IFCLR(CPU_FTR_CI_LARGE_PAGE) BEGIN_FTR_SECTION cmpdi r9,0 /* check segment size */ bne 3f -END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT) - /* Calc va and put it in r29 */ - rldicr r29,r5,28,63-28 - rldicl r3,r3,0,36 - or r29,r3,r29 - - /* Calculate hash value for primary slot and store it in r28 */ - rldicl r5,r5,0,25 /* vsid & 0x0000007fffffffff */ - rldicl r0,r3,64-16,52 /* (ea >> 16) & 0xfff */ - xor r28,r5,r0 +END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) + /* Calc vpn and put it in r29 */ + sldi r29,r5,SID_SHIFT - VPN_SHIFT + rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT - VPN_SHIFT) + or r29,r28,r29 + + /* Calculate hash value for primary slot and store it in r28 + * r3 = va, r5 = vsid + * r0 = (va >> 16) & ((1ul << (28 - 16)) -1) + */ + rldicl r0,r3,64-16,52 + xor r28,r5,r0 /* hash */ b 4f -3: /* Calc VA and hash in r29 and r28 for 1T segment */ - sldi r29,r5,40 /* vsid << 40 */ - clrldi r3,r3,24 /* ea & 0xffffffffff */ - rldic r28,r5,25,25 /* (vsid << 25) & 0x7fffffffff */ - clrldi r5,r5,40 /* vsid & 0xffffff */ - rldicl r0,r3,64-16,40 /* (ea >> 16) & 0xffffff */ - xor r28,r28,r5 - or r29,r3,r29 /* VA */ +3: /* Calc vpn and put it in r29 */ + sldi r29,r5,SID_SHIFT_1T - VPN_SHIFT + rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT_1T - VPN_SHIFT) + or r29,r28,r29 + /* + * calculate hash value for primary slot and + * store it in r28 for 1T segment + * r3 = va, r5 = vsid + */ + sldi r28,r5,25 /* vsid << 25 */ + /* r0 = (va >> 16) & ((1ul << (40 - 16)) -1) */ + rldicl r0,r3,64-16,40 + xor r28,r28,r5 /* vsid ^ ( vsid << 25) */ xor r28,r28,r0 /* hash */ /* Convert linux PTE bits into HW equivalents */ @@ -771,7 +809,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT) and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/ andc r0,r30,r0 /* r0 = pte & ~r0 */ rlwimi r3,r0,32-1,31,31 /* Insert result into PP lsb */ - ori r3,r3,HPTE_R_C /* Always add "C" bit for perf. */ + /* + * Always add "C" bit for perf. Memory coherence is always enabled + */ + ori r3,r3,HPTE_R_C | HPTE_R_M /* We eventually do the icache sync here (maybe inline that * code rather than call a C function...) @@ -779,13 +820,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT) BEGIN_FTR_SECTION mr r4,r30 mr r5,r7 - bl .hash_page_do_lazy_icache + bl hash_page_do_lazy_icache END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE) /* At this point, r3 contains new PP bits, save them in * place of "access" in the param area (sic) */ - std r3,STK_PARM(r4)(r1) + std r3,STK_PARAM(R4)(r1) /* Get htab_hash_mask */ ld r4,htab_hash_mask@got(2) @@ -794,18 +835,21 @@ END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE) /* Check if we may already be in the hashtable, in this case, we * go to out-of-line code to try to modify the HPTE */ - andi. r0,r31,_PAGE_HASHPTE + rldicl. r0,r31,64-12,48 bne ht64_modify_pte ht64_insert_pte: /* Clear hpte bits in new pte (we also clear BUSY btw) and - * add _PAGE_HASHPTE + * add _PAGE_HPTE_SUB0 */ lis r0,_PAGE_HPTEFLAGS@h ori r0,r0,_PAGE_HPTEFLAGS@l andc r30,r30,r0 +#ifdef CONFIG_PPC_64K_PAGES + oris r30,r30,_PAGE_HPTE_SUB0@h +#else ori r30,r30,_PAGE_HASHPTE - +#endif /* Phyical address in r5 */ rldicl r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT sldi r5,r5,PAGE_SHIFT @@ -815,12 +859,14 @@ ht64_insert_pte: rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */ /* Call ppc_md.hpte_insert */ - ld r6,STK_PARM(r4)(r1) /* Retreive new pp bits */ - mr r4,r29 /* Retreive va */ + ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */ + mr r4,r29 /* Retrieve vpn */ li r7,0 /* !bolted, !secondary */ li r8,MMU_PAGE_64K - ld r9,STK_PARM(r9)(r1) /* segment size */ -_GLOBAL(ht64_call_hpte_insert1) + li r9,MMU_PAGE_64K /* actual page size */ + ld r10,STK_PARAM(R9)(r1) /* segment size */ +.globl ht64_call_hpte_insert1 +ht64_call_hpte_insert1: bl . /* patched by htab_finish_init() */ cmpdi 0,r3,0 bge ht64_pte_insert_ok /* Insertion successful */ @@ -838,12 +884,14 @@ _GLOBAL(ht64_call_hpte_insert1) rldicr r3,r0,3,63-3 /* r0 = (~hash & mask) << 3 */ /* Call ppc_md.hpte_insert */ - ld r6,STK_PARM(r4)(r1) /* Retreive new pp bits */ - mr r4,r29 /* Retreive va */ + ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */ + mr r4,r29 /* Retrieve vpn */ li r7,HPTE_V_SECONDARY /* !bolted, secondary */ li r8,MMU_PAGE_64K - ld r9,STK_PARM(r9)(r1) /* segment size */ -_GLOBAL(ht64_call_hpte_insert2) + li r9,MMU_PAGE_64K /* actual page size */ + ld r10,STK_PARAM(R9)(r1) /* segment size */ +.globl ht64_call_hpte_insert2 +ht64_call_hpte_insert2: bl . /* patched by htab_finish_init() */ cmpdi 0,r3,0 bge+ ht64_pte_insert_ok /* Insertion successful */ @@ -860,7 +908,8 @@ _GLOBAL(ht64_call_hpte_insert2) 2: and r0,r5,r27 rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */ /* Call ppc_md.hpte_remove */ -_GLOBAL(ht64_call_hpte_remove) +.globl ht64_call_hpte_remove +ht64_call_hpte_remove: bl . /* patched by htab_finish_init() */ /* Try all again */ @@ -878,15 +927,15 @@ ht64_pte_insert_ok: * (maybe add eieio may be good still ?) */ ht64_write_out_pte: - ld r6,STK_PARM(r6)(r1) + ld r6,STK_PARAM(R6)(r1) std r30,0(r6) li r3, 0 ht64_bail: - ld r27,STK_REG(r27)(r1) - ld r28,STK_REG(r28)(r1) - ld r29,STK_REG(r29)(r1) - ld r30,STK_REG(r30)(r1) - ld r31,STK_REG(r31)(r1) + ld r27,STK_REG(R27)(r1) + ld r28,STK_REG(R28)(r1) + ld r29,STK_REG(R29)(r1) + ld r30,STK_REG(R30)(r1) + ld r31,STK_REG(R31)(r1) addi r1,r1,STACKFRAMESIZE ld r0,16(r1) mtlr r0 @@ -909,11 +958,13 @@ ht64_modify_pte: add r3,r0,r3 /* add slot idx */ /* Call ppc_md.hpte_updatepp */ - mr r5,r29 /* va */ - li r6,MMU_PAGE_64K - ld r7,STK_PARM(r9)(r1) /* segment size */ - ld r8,STK_PARM(r8)(r1) /* get "local" param */ -_GLOBAL(ht64_call_hpte_updatepp) + mr r5,r29 /* vpn */ + li r6,MMU_PAGE_64K /* base page size */ + li r7,MMU_PAGE_64K /* actual page size */ + ld r8,STK_PARAM(R9)(r1) /* segment size */ + ld r9,STK_PARAM(R8)(r1) /* get "local" param */ +.globl ht64_call_hpte_updatepp +ht64_call_hpte_updatepp: bl . /* patched by htab_finish_init() */ /* if we failed because typically the HPTE wasn't really here @@ -935,7 +986,7 @@ ht64_wrong_access: ht64_pte_insert_failure: /* Bail out restoring old PTE */ - ld r6,STK_PARM(r6)(r1) + ld r6,STK_PARAM(R6)(r1) std r31,0(r6) li r3,-1 b ht64_bail diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index 34e5c0b219b..cf1d325eae8 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -14,10 +14,10 @@ #include <linux/spinlock.h> #include <linux/bitops.h> +#include <linux/of.h> #include <linux/threads.h> #include <linux/smp.h> -#include <asm/abs_addr.h> #include <asm/machdep.h> #include <asm/mmu.h> #include <asm/mmu_context.h> @@ -27,6 +27,7 @@ #include <asm/cputable.h> #include <asm/udbg.h> #include <asm/kexec.h> +#include <asm/ppc-opcode.h> #ifdef DEBUG_LOW #define DBG_LOW(fmt...) udbg_printf(fmt) @@ -34,52 +35,110 @@ #define DBG_LOW(fmt...) #endif +#ifdef __BIG_ENDIAN__ #define HPTE_LOCK_BIT 3 +#else +#define HPTE_LOCK_BIT (56+3) +#endif -static DEFINE_SPINLOCK(native_tlbie_lock); +DEFINE_RAW_SPINLOCK(native_tlbie_lock); -static inline void __tlbie(unsigned long va, int psize, int ssize) +static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize) { + unsigned long va; unsigned int penc; + unsigned long sllp; - /* clear top 16 bits, non SLS segment */ + /* + * We need 14 to 65 bits of va for a tlibe of 4K page + * With vpn we ignore the lower VPN_SHIFT bits already. + * And top two bits are already ignored because we can + * only accomadate 76 bits in a 64 bit vpn with a VPN_SHIFT + * of 12. + */ + va = vpn << VPN_SHIFT; + /* + * clear top 16 bits of 64bit va, non SLS segment + * Older versions of the architecture (2.02 and earler) require the + * masking of the top 16 bits. + */ va &= ~(0xffffULL << 48); switch (psize) { case MMU_PAGE_4K: - va &= ~0xffful; + /* clear out bits after (52) [0....52.....63] */ + va &= ~((1ul << (64 - 52)) - 1); va |= ssize << 8; - asm volatile("tlbie %0,0" : : "r" (va) : "memory"); + sllp = ((mmu_psize_defs[apsize].sllp & SLB_VSID_L) >> 6) | + ((mmu_psize_defs[apsize].sllp & SLB_VSID_LP) >> 4); + va |= sllp << 5; + asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2) + : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206) + : "memory"); break; default: - penc = mmu_psize_defs[psize].penc; - va &= ~((1ul << mmu_psize_defs[psize].shift) - 1); + /* We need 14 to 14 + i bits of va */ + penc = mmu_psize_defs[psize].penc[apsize]; + va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1); va |= penc << 12; va |= ssize << 8; - asm volatile("tlbie %0,1" : : "r" (va) : "memory"); + /* + * AVAL bits: + * We don't need all the bits, but rest of the bits + * must be ignored by the processor. + * vpn cover upto 65 bits of va. (0...65) and we need + * 58..64 bits of va. + */ + va |= (vpn & 0xfe); /* AVAL */ + va |= 1; /* L */ + asm volatile(ASM_FTR_IFCLR("tlbie %0,1", PPC_TLBIE(%1,%0), %2) + : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206) + : "memory"); break; } } -static inline void __tlbiel(unsigned long va, int psize, int ssize) +static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize) { + unsigned long va; unsigned int penc; + unsigned long sllp; - /* clear top 16 bits, non SLS segment */ + /* VPN_SHIFT can be atmost 12 */ + va = vpn << VPN_SHIFT; + /* + * clear top 16 bits of 64 bit va, non SLS segment + * Older versions of the architecture (2.02 and earler) require the + * masking of the top 16 bits. + */ va &= ~(0xffffULL << 48); switch (psize) { case MMU_PAGE_4K: - va &= ~0xffful; + /* clear out bits after(52) [0....52.....63] */ + va &= ~((1ul << (64 - 52)) - 1); va |= ssize << 8; + sllp = ((mmu_psize_defs[apsize].sllp & SLB_VSID_L) >> 6) | + ((mmu_psize_defs[apsize].sllp & SLB_VSID_LP) >> 4); + va |= sllp << 5; asm volatile(".long 0x7c000224 | (%0 << 11) | (0 << 21)" : : "r"(va) : "memory"); break; default: - penc = mmu_psize_defs[psize].penc; - va &= ~((1ul << mmu_psize_defs[psize].shift) - 1); + /* We need 14 to 14 + i bits of va */ + penc = mmu_psize_defs[psize].penc[apsize]; + va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1); va |= penc << 12; va |= ssize << 8; + /* + * AVAL bits: + * We don't need all the bits, but rest of the bits + * must be ignored by the processor. + * vpn cover upto 65 bits of va. (0...65) and we need + * 58..64 bits of va. + */ + va |= (vpn & 0xfe); + va |= 1; /* L */ asm volatile(".long 0x7c000224 | (%0 << 11) | (1 << 21)" : : "r"(va) : "memory"); break; @@ -87,33 +146,34 @@ static inline void __tlbiel(unsigned long va, int psize, int ssize) } -static inline void tlbie(unsigned long va, int psize, int ssize, int local) +static inline void tlbie(unsigned long vpn, int psize, int apsize, + int ssize, int local) { - unsigned int use_local = local && cpu_has_feature(CPU_FTR_TLBIEL); - int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE); + unsigned int use_local = local && mmu_has_feature(MMU_FTR_TLBIEL); + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); if (use_local) use_local = mmu_psize_defs[psize].tlbiel; if (lock_tlbie && !use_local) - spin_lock(&native_tlbie_lock); + raw_spin_lock(&native_tlbie_lock); asm volatile("ptesync": : :"memory"); if (use_local) { - __tlbiel(va, psize, ssize); + __tlbiel(vpn, psize, apsize, ssize); asm volatile("ptesync": : :"memory"); } else { - __tlbie(va, psize, ssize); + __tlbie(vpn, psize, apsize, ssize); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } if (lock_tlbie && !use_local) - spin_unlock(&native_tlbie_lock); + raw_spin_unlock(&native_tlbie_lock); } static inline void native_lock_hpte(struct hash_pte *hptep) { - unsigned long *word = &hptep->v; + unsigned long *word = (unsigned long *)&hptep->v; while (1) { - if (!test_and_set_bit(HPTE_LOCK_BIT, word)) + if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word)) break; while(test_bit(HPTE_LOCK_BIT, word)) cpu_relax(); @@ -122,31 +182,30 @@ static inline void native_lock_hpte(struct hash_pte *hptep) static inline void native_unlock_hpte(struct hash_pte *hptep) { - unsigned long *word = &hptep->v; + unsigned long *word = (unsigned long *)&hptep->v; - asm volatile("lwsync":::"memory"); - clear_bit(HPTE_LOCK_BIT, word); + clear_bit_unlock(HPTE_LOCK_BIT, word); } -static long native_hpte_insert(unsigned long hpte_group, unsigned long va, +static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, unsigned long pa, unsigned long rflags, - unsigned long vflags, int psize, int ssize) + unsigned long vflags, int psize, int apsize, int ssize) { struct hash_pte *hptep = htab_address + hpte_group; unsigned long hpte_v, hpte_r; int i; if (!(vflags & HPTE_V_BOLTED)) { - DBG_LOW(" insert(group=%lx, va=%016lx, pa=%016lx," + DBG_LOW(" insert(group=%lx, vpn=%016lx, pa=%016lx," " rflags=%lx, vflags=%lx, psize=%d)\n", - hpte_group, va, pa, rflags, vflags, psize); + hpte_group, vpn, pa, rflags, vflags, psize); } for (i = 0; i < HPTES_PER_GROUP; i++) { - if (! (hptep->v & HPTE_V_VALID)) { + if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID)) { /* retry with lock held */ native_lock_hpte(hptep); - if (! (hptep->v & HPTE_V_VALID)) + if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID)) break; native_unlock_hpte(hptep); } @@ -157,22 +216,22 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long va, if (i == HPTES_PER_GROUP) return -1; - hpte_v = hpte_encode_v(va, psize, ssize) | vflags | HPTE_V_VALID; - hpte_r = hpte_encode_r(pa, psize) | rflags; + hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; + hpte_r = hpte_encode_r(pa, psize, apsize) | rflags; if (!(vflags & HPTE_V_BOLTED)) { DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n", i, hpte_v, hpte_r); } - hptep->r = hpte_r; + hptep->r = cpu_to_be64(hpte_r); /* Guarantee the second dword is visible before the valid bit */ eieio(); /* * Now set the first dword including the valid bit * NOTE: this also unlocks the hpte */ - hptep->v = hpte_v; + hptep->v = cpu_to_be64(hpte_v); __asm__ __volatile__ ("ptesync" : : : "memory"); @@ -193,12 +252,12 @@ static long native_hpte_remove(unsigned long hpte_group) for (i = 0; i < HPTES_PER_GROUP; i++) { hptep = htab_address + hpte_group + slot_offset; - hpte_v = hptep->v; + hpte_v = be64_to_cpu(hptep->v); if ((hpte_v & HPTE_V_VALID) && !(hpte_v & HPTE_V_BOLTED)) { /* retry with lock held */ native_lock_hpte(hptep); - hpte_v = hptep->v; + hpte_v = be64_to_cpu(hptep->v); if ((hpte_v & HPTE_V_VALID) && !(hpte_v & HPTE_V_BOLTED)) break; @@ -219,41 +278,46 @@ static long native_hpte_remove(unsigned long hpte_group) } static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, - unsigned long va, int psize, int ssize, - int local) + unsigned long vpn, int bpsize, + int apsize, int ssize, int local) { struct hash_pte *hptep = htab_address + slot; unsigned long hpte_v, want_v; int ret = 0; - want_v = hpte_encode_v(va, psize, ssize); + want_v = hpte_encode_avpn(vpn, bpsize, ssize); - DBG_LOW(" update(va=%016lx, avpnv=%016lx, hash=%016lx, newpp=%x)", - va, want_v & HPTE_V_AVPN, slot, newpp); + DBG_LOW(" update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)", + vpn, want_v & HPTE_V_AVPN, slot, newpp); native_lock_hpte(hptep); - hpte_v = hptep->v; - - /* Even if we miss, we need to invalidate the TLB */ + hpte_v = be64_to_cpu(hptep->v); + /* + * We need to invalidate the TLB always because hpte_remove doesn't do + * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less + * random entry from it. When we do that we don't invalidate the TLB + * (hpte_remove) because we assume the old translation is still + * technically "valid". + */ if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) { DBG_LOW(" -> miss\n"); ret = -1; } else { DBG_LOW(" -> hit\n"); /* Update the HPTE */ - hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) | - (newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_C)); + hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) & ~(HPTE_R_PP | HPTE_R_N)) | + (newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_C))); } native_unlock_hpte(hptep); /* Ensure it is out of the tlb too. */ - tlbie(va, psize, ssize, local); + tlbie(vpn, bpsize, apsize, ssize, local); return ret; } -static long native_hpte_find(unsigned long va, int psize, int ssize) +static long native_hpte_find(unsigned long vpn, int psize, int ssize) { struct hash_pte *hptep; unsigned long hash; @@ -261,14 +325,14 @@ static long native_hpte_find(unsigned long va, int psize, int ssize) long slot; unsigned long want_v, hpte_v; - hash = hpt_hash(va, mmu_psize_defs[psize].shift, ssize); - want_v = hpte_encode_v(va, psize, ssize); + hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize); + want_v = hpte_encode_avpn(vpn, psize, ssize); /* Bolted mappings are only ever in the primary group */ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; for (i = 0; i < HPTES_PER_GROUP; i++) { hptep = htab_address + slot; - hpte_v = hptep->v; + hpte_v = be64_to_cpu(hptep->v); if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) /* HPTE matches */ @@ -289,28 +353,32 @@ static long native_hpte_find(unsigned long va, int psize, int ssize) static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, int psize, int ssize) { - unsigned long vsid, va; + unsigned long vpn; + unsigned long vsid; long slot; struct hash_pte *hptep; vsid = get_kernel_vsid(ea, ssize); - va = hpt_va(ea, vsid, ssize); + vpn = hpt_vpn(ea, vsid, ssize); - slot = native_hpte_find(va, psize, ssize); + slot = native_hpte_find(vpn, psize, ssize); if (slot == -1) panic("could not find page to bolt\n"); hptep = htab_address + slot; /* Update the HPTE */ - hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) | - (newpp & (HPTE_R_PP | HPTE_R_N)); - - /* Ensure it is out of the tlb too. */ - tlbie(va, psize, ssize, 0); + hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) & + ~(HPTE_R_PP | HPTE_R_N)) | + (newpp & (HPTE_R_PP | HPTE_R_N))); + /* + * Ensure it is out of the tlb too. Bolted entries base and + * actual page size will be same. + */ + tlbie(vpn, psize, psize, ssize, 0); } -static void native_hpte_invalidate(unsigned long slot, unsigned long va, - int psize, int ssize, int local) +static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, + int bpsize, int apsize, int ssize, int local) { struct hash_pte *hptep = htab_address + slot; unsigned long hpte_v; @@ -319,13 +387,19 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long va, local_irq_save(flags); - DBG_LOW(" invalidate(va=%016lx, hash: %x)\n", va, slot); + DBG_LOW(" invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot); - want_v = hpte_encode_v(va, psize, ssize); + want_v = hpte_encode_avpn(vpn, bpsize, ssize); native_lock_hpte(hptep); - hpte_v = hptep->v; + hpte_v = be64_to_cpu(hptep->v); - /* Even if we miss, we need to invalidate the TLB */ + /* + * We need to invalidate the TLB always because hpte_remove doesn't do + * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less + * random entry from it. When we do that we don't invalidate the TLB + * (hpte_remove) because we assume the old translation is still + * technically "valid". + */ if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) native_unlock_hpte(hptep); else @@ -333,73 +407,175 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long va, hptep->v = 0; /* Invalidate the TLB */ - tlbie(va, psize, ssize, local); + tlbie(vpn, bpsize, apsize, ssize, local); + + local_irq_restore(flags); +} + +static void native_hugepage_invalidate(struct mm_struct *mm, + unsigned char *hpte_slot_array, + unsigned long addr, int psize) +{ + int ssize = 0, i; + int lock_tlbie; + struct hash_pte *hptep; + int actual_psize = MMU_PAGE_16M; + unsigned int max_hpte_count, valid; + unsigned long flags, s_addr = addr; + unsigned long hpte_v, want_v, shift; + unsigned long hidx, vpn = 0, vsid, hash, slot; + + shift = mmu_psize_defs[psize].shift; + max_hpte_count = 1U << (PMD_SHIFT - shift); + + local_irq_save(flags); + for (i = 0; i < max_hpte_count; i++) { + valid = hpte_valid(hpte_slot_array, i); + if (!valid) + continue; + hidx = hpte_hash_index(hpte_slot_array, i); + + /* get the vpn */ + addr = s_addr + (i * (1ul << shift)); + if (!is_kernel_addr(addr)) { + ssize = user_segment_size(addr); + vsid = get_vsid(mm->context.id, addr, ssize); + WARN_ON(vsid == 0); + } else { + vsid = get_kernel_vsid(addr, mmu_kernel_ssize); + ssize = mmu_kernel_ssize; + } + + vpn = hpt_vpn(addr, vsid, ssize); + hash = hpt_hash(vpn, shift, ssize); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + + hptep = htab_address + slot; + want_v = hpte_encode_avpn(vpn, psize, ssize); + native_lock_hpte(hptep); + hpte_v = be64_to_cpu(hptep->v); + + /* Even if we miss, we need to invalidate the TLB */ + if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) + native_unlock_hpte(hptep); + else + /* Invalidate the hpte. NOTE: this also unlocks it */ + hptep->v = 0; + } + /* + * Since this is a hugepage, we just need a single tlbie. + * use the last vpn. + */ + lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + if (lock_tlbie) + raw_spin_lock(&native_tlbie_lock); + + asm volatile("ptesync":::"memory"); + __tlbie(vpn, psize, actual_psize, ssize); + asm volatile("eieio; tlbsync; ptesync":::"memory"); + + if (lock_tlbie) + raw_spin_unlock(&native_tlbie_lock); local_irq_restore(flags); } -#define LP_SHIFT 12 -#define LP_BITS 8 -#define LP_MASK(i) ((0xFF >> (i)) << LP_SHIFT) +static inline int __hpte_actual_psize(unsigned int lp, int psize) +{ + int i, shift; + unsigned int mask; + + /* start from 1 ignoring MMU_PAGE_4K */ + for (i = 1; i < MMU_PAGE_COUNT; i++) { + + /* invalid penc */ + if (mmu_psize_defs[psize].penc[i] == -1) + continue; + /* + * encoding bits per actual page size + * PTE LP actual page size + * rrrr rrrz >=8KB + * rrrr rrzz >=16KB + * rrrr rzzz >=32KB + * rrrr zzzz >=64KB + * ....... + */ + shift = mmu_psize_defs[i].shift - LP_SHIFT; + if (shift > LP_BITS) + shift = LP_BITS; + mask = (1 << shift) - 1; + if ((lp & mask) == mmu_psize_defs[psize].penc[i]) + return i; + } + return -1; +} static void hpte_decode(struct hash_pte *hpte, unsigned long slot, - int *psize, int *ssize, unsigned long *va) + int *psize, int *apsize, int *ssize, unsigned long *vpn) { - unsigned long hpte_r = hpte->r; - unsigned long hpte_v = hpte->v; - unsigned long avpn; - int i, size, shift, penc; - - if (!(hpte_v & HPTE_V_LARGE)) - size = MMU_PAGE_4K; - else { - for (i = 0; i < LP_BITS; i++) { - if ((hpte_r & LP_MASK(i+1)) == LP_MASK(i+1)) - break; - } - penc = LP_MASK(i+1) >> LP_SHIFT; + unsigned long avpn, pteg, vpi; + unsigned long hpte_v = be64_to_cpu(hpte->v); + unsigned long hpte_r = be64_to_cpu(hpte->r); + unsigned long vsid, seg_off; + int size, a_size, shift; + /* Look at the 8 bit LP value */ + unsigned int lp = (hpte_r >> LP_SHIFT) & ((1 << LP_BITS) - 1); + + if (!(hpte_v & HPTE_V_LARGE)) { + size = MMU_PAGE_4K; + a_size = MMU_PAGE_4K; + } else { for (size = 0; size < MMU_PAGE_COUNT; size++) { - /* 4K pages are not represented by LP */ - if (size == MMU_PAGE_4K) - continue; - /* valid entries have a shift value */ if (!mmu_psize_defs[size].shift) continue; - if (penc == mmu_psize_defs[size].penc) + a_size = __hpte_actual_psize(lp, size); + if (a_size != -1) break; } } - /* This works for all page sizes, and for 256M and 1T segments */ + *ssize = hpte_v >> HPTE_V_SSIZE_SHIFT; shift = mmu_psize_defs[size].shift; - avpn = (HPTE_V_AVPN_VAL(hpte_v) & ~mmu_psize_defs[size].avpnm) << 23; - if (shift < 23) { - unsigned long vpi, vsid, pteg; - - pteg = slot / HPTES_PER_GROUP; - if (hpte_v & HPTE_V_SECONDARY) - pteg = ~pteg; - switch (hpte_v >> HPTE_V_SSIZE_SHIFT) { - case MMU_SEGSIZE_256M: - vpi = ((avpn >> 28) ^ pteg) & htab_hash_mask; - break; - case MMU_SEGSIZE_1T: - vsid = avpn >> 40; + avpn = (HPTE_V_AVPN_VAL(hpte_v) & ~mmu_psize_defs[size].avpnm); + pteg = slot / HPTES_PER_GROUP; + if (hpte_v & HPTE_V_SECONDARY) + pteg = ~pteg; + + switch (*ssize) { + case MMU_SEGSIZE_256M: + /* We only have 28 - 23 bits of seg_off in avpn */ + seg_off = (avpn & 0x1f) << 23; + vsid = avpn >> 5; + /* We can find more bits from the pteg value */ + if (shift < 23) { + vpi = (vsid ^ pteg) & htab_hash_mask; + seg_off |= vpi << shift; + } + *vpn = vsid << (SID_SHIFT - VPN_SHIFT) | seg_off >> VPN_SHIFT; + break; + case MMU_SEGSIZE_1T: + /* We only have 40 - 23 bits of seg_off in avpn */ + seg_off = (avpn & 0x1ffff) << 23; + vsid = avpn >> 17; + if (shift < 23) { vpi = (vsid ^ (vsid << 25) ^ pteg) & htab_hash_mask; - break; - default: - avpn = vpi = size = 0; + seg_off |= vpi << shift; } - avpn |= (vpi << mmu_psize_defs[size].shift); + *vpn = vsid << (SID_SHIFT_1T - VPN_SHIFT) | seg_off >> VPN_SHIFT; + break; + default: + *vpn = size = 0; } - - *va = avpn; - *psize = size; - *ssize = hpte_v >> HPTE_V_SSIZE_SHIFT; + *psize = size; + *apsize = a_size; } /* @@ -412,11 +588,12 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot, */ static void native_hpte_clear(void) { + unsigned long vpn = 0; unsigned long slot, slots, flags; struct hash_pte *hptep = htab_address; - unsigned long hpte_v, va; + unsigned long hpte_v; unsigned long pteg_count; - int psize, ssize; + int psize, apsize, ssize; pteg_count = htab_hash_mask + 1; @@ -425,7 +602,7 @@ static void native_hpte_clear(void) /* we take the tlbie lock and hold it. Some hardware will * deadlock if we try to tlbie from two processors at once. */ - spin_lock(&native_tlbie_lock); + raw_spin_lock(&native_tlbie_lock); slots = pteg_count * HPTES_PER_GROUP; @@ -435,21 +612,21 @@ static void native_hpte_clear(void) * running, right? and for crash dump, we probably * don't want to wait for a maybe bad cpu. */ - hpte_v = hptep->v; + hpte_v = be64_to_cpu(hptep->v); /* * Call __tlbie() here rather than tlbie() since we * already hold the native_tlbie_lock. */ if (hpte_v & HPTE_V_VALID) { - hpte_decode(hptep, slot, &psize, &ssize, &va); + hpte_decode(hptep, slot, &psize, &apsize, &ssize, &vpn); hptep->v = 0; - __tlbie(va, psize, ssize); + __tlbie(vpn, psize, apsize, ssize); } } asm volatile("eieio; tlbsync; ptesync":::"memory"); - spin_unlock(&native_tlbie_lock); + raw_spin_unlock(&native_tlbie_lock); local_irq_restore(flags); } @@ -459,7 +636,8 @@ static void native_hpte_clear(void) */ static void native_flush_hash_range(unsigned long number, int local) { - unsigned long va, hash, index, hidx, shift, slot; + unsigned long vpn; + unsigned long hash, index, hidx, shift, slot; struct hash_pte *hptep; unsigned long hpte_v; unsigned long want_v; @@ -473,20 +651,20 @@ static void native_flush_hash_range(unsigned long number, int local) local_irq_save(flags); for (i = 0; i < number; i++) { - va = batch->vaddr[i]; + vpn = batch->vpn[i]; pte = batch->pte[i]; - pte_iterate_hashed_subpages(pte, psize, va, index, shift) { - hash = hpt_hash(va, shift, ssize); + pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) { + hash = hpt_hash(vpn, shift, ssize); hidx = __rpte_to_hidx(pte, index); if (hidx & _PTEIDX_SECONDARY) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; slot += hidx & _PTEIDX_GROUP_IX; hptep = htab_address + slot; - want_v = hpte_encode_v(va, psize, ssize); + want_v = hpte_encode_avpn(vpn, psize, ssize); native_lock_hpte(hptep); - hpte_v = hptep->v; + hpte_v = be64_to_cpu(hptep->v); if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) native_unlock_hpte(hptep); @@ -495,67 +673,44 @@ static void native_flush_hash_range(unsigned long number, int local) } pte_iterate_hashed_end(); } - if (cpu_has_feature(CPU_FTR_TLBIEL) && + if (mmu_has_feature(MMU_FTR_TLBIEL) && mmu_psize_defs[psize].tlbiel && local) { asm volatile("ptesync":::"memory"); for (i = 0; i < number; i++) { - va = batch->vaddr[i]; + vpn = batch->vpn[i]; pte = batch->pte[i]; - pte_iterate_hashed_subpages(pte, psize, va, index, - shift) { - __tlbiel(va, psize, ssize); + pte_iterate_hashed_subpages(pte, psize, + vpn, index, shift) { + __tlbiel(vpn, psize, psize, ssize); } pte_iterate_hashed_end(); } asm volatile("ptesync":::"memory"); } else { - int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE); + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); if (lock_tlbie) - spin_lock(&native_tlbie_lock); + raw_spin_lock(&native_tlbie_lock); asm volatile("ptesync":::"memory"); for (i = 0; i < number; i++) { - va = batch->vaddr[i]; + vpn = batch->vpn[i]; pte = batch->pte[i]; - pte_iterate_hashed_subpages(pte, psize, va, index, - shift) { - __tlbie(va, psize, ssize); + pte_iterate_hashed_subpages(pte, psize, + vpn, index, shift) { + __tlbie(vpn, psize, psize, ssize); } pte_iterate_hashed_end(); } asm volatile("eieio; tlbsync; ptesync":::"memory"); if (lock_tlbie) - spin_unlock(&native_tlbie_lock); + raw_spin_unlock(&native_tlbie_lock); } local_irq_restore(flags); } -#ifdef CONFIG_PPC_PSERIES -/* Disable TLB batching on nighthawk */ -static inline int tlb_batching_enabled(void) -{ - struct device_node *root = of_find_node_by_path("/"); - int enabled = 1; - - if (root) { - const char *model = of_get_property(root, "model", NULL); - if (model && !strcmp(model, "IBM,9076-N81")) - enabled = 0; - of_node_put(root); - } - - return enabled; -} -#else -static inline int tlb_batching_enabled(void) -{ - return 1; -} -#endif - void __init hpte_init_native(void) { ppc_md.hpte_invalidate = native_hpte_invalidate; @@ -564,6 +719,6 @@ void __init hpte_init_native(void) ppc_md.hpte_insert = native_hpte_insert; ppc_md.hpte_remove = native_hpte_remove; ppc_md.hpte_clear_all = native_hpte_clear; - if (tlb_batching_enabled()) - ppc_md.flush_hash_range = native_flush_hash_range; + ppc_md.flush_hash_range = native_flush_hash_range; + ppc_md.hugepage_invalidate = native_hugepage_invalidate; } diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index a83dfa3cf40..88fdd9d2507 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -27,10 +27,13 @@ #include <linux/proc_fs.h> #include <linux/stat.h> #include <linux/sysctl.h> +#include <linux/export.h> #include <linux/ctype.h> #include <linux/cache.h> #include <linux/init.h> #include <linux/signal.h> +#include <linux/memblock.h> +#include <linux/context_tracking.h> #include <asm/processor.h> #include <asm/pgtable.h> @@ -38,11 +41,9 @@ #include <asm/mmu_context.h> #include <asm/page.h> #include <asm/types.h> -#include <asm/system.h> #include <asm/uaccess.h> #include <asm/machdep.h> -#include <asm/lmb.h> -#include <asm/abs_addr.h> +#include <asm/prom.h> #include <asm/tlbflush.h> #include <asm/io.h> #include <asm/eeh.h> @@ -52,6 +53,10 @@ #include <asm/sections.h> #include <asm/spu.h> #include <asm/udbg.h> +#include <asm/code-patching.h> +#include <asm/fadump.h> +#include <asm/firmware.h> +#include <asm/tm.h> #ifdef DEBUG #define DBG(fmt...) udbg_printf(fmt) @@ -67,6 +72,7 @@ #define KB (1024) #define MB (1024*KB) +#define GB (1024L*MB) /* * Note: pte --> Linux PTE @@ -90,17 +96,18 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; struct hash_pte *htab_address; unsigned long htab_size_bytes; unsigned long htab_hash_mask; +EXPORT_SYMBOL_GPL(htab_hash_mask); int mmu_linear_psize = MMU_PAGE_4K; int mmu_virtual_psize = MMU_PAGE_4K; int mmu_vmalloc_psize = MMU_PAGE_4K; +#ifdef CONFIG_SPARSEMEM_VMEMMAP +int mmu_vmemmap_psize = MMU_PAGE_4K; +#endif int mmu_io_psize = MMU_PAGE_4K; int mmu_kernel_ssize = MMU_SEGSIZE_256M; int mmu_highuser_ssize = MMU_SEGSIZE_256M; u16 mmu_slb_size = 64; -#ifdef CONFIG_HUGETLB_PAGE -int mmu_huge_psize = MMU_PAGE_16M; -unsigned int HPAGE_SHIFT; -#endif +EXPORT_SYMBOL_GPL(mmu_slb_size); #ifdef CONFIG_PPC_64K_PAGES int mmu_ci_restrictions; #endif @@ -116,11 +123,11 @@ static DEFINE_SPINLOCK(linear_map_hash_lock); /* Pre-POWER4 CPUs (4k pages only) */ -struct mmu_psize_def mmu_psize_defaults_old[] = { +static struct mmu_psize_def mmu_psize_defaults_old[] = { [MMU_PAGE_4K] = { .shift = 12, .sllp = 0, - .penc = 0, + .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1}, .avpnm = 0, .tlbiel = 0, }, @@ -130,56 +137,100 @@ struct mmu_psize_def mmu_psize_defaults_old[] = { * * Support for 16Mb large pages */ -struct mmu_psize_def mmu_psize_defaults_gp[] = { +static struct mmu_psize_def mmu_psize_defaults_gp[] = { [MMU_PAGE_4K] = { .shift = 12, .sllp = 0, - .penc = 0, + .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1}, .avpnm = 0, .tlbiel = 1, }, [MMU_PAGE_16M] = { .shift = 24, .sllp = SLB_VSID_L, - .penc = 0, + .penc = {[0 ... MMU_PAGE_16M - 1] = -1, [MMU_PAGE_16M] = 0, + [MMU_PAGE_16M + 1 ... MMU_PAGE_COUNT - 1] = -1 }, .avpnm = 0x1UL, .tlbiel = 0, }, }; +static unsigned long htab_convert_pte_flags(unsigned long pteflags) +{ + unsigned long rflags = pteflags & 0x1fa; + + /* _PAGE_EXEC -> NOEXEC */ + if ((pteflags & _PAGE_EXEC) == 0) + rflags |= HPTE_R_N; + + /* PP bits. PAGE_USER is already PP bit 0x2, so we only + * need to add in 0x1 if it's a read-only user page + */ + if ((pteflags & _PAGE_USER) && !((pteflags & _PAGE_RW) && + (pteflags & _PAGE_DIRTY))) + rflags |= 1; + /* + * Always add "C" bit for perf. Memory coherence is always enabled + */ + return rflags | HPTE_R_C | HPTE_R_M; +} int htab_bolt_mapping(unsigned long vstart, unsigned long vend, - unsigned long pstart, unsigned long mode, + unsigned long pstart, unsigned long prot, int psize, int ssize) { unsigned long vaddr, paddr; unsigned int step, shift; - unsigned long tmp_mode; int ret = 0; shift = mmu_psize_defs[psize].shift; step = 1 << shift; + prot = htab_convert_pte_flags(prot); + + DBG("htab_bolt_mapping(%lx..%lx -> %lx (%lx,%d,%d)\n", + vstart, vend, pstart, prot, psize, ssize); + for (vaddr = vstart, paddr = pstart; vaddr < vend; vaddr += step, paddr += step) { unsigned long hash, hpteg; unsigned long vsid = get_kernel_vsid(vaddr, ssize); - unsigned long va = hpt_va(vaddr, vsid, ssize); + unsigned long vpn = hpt_vpn(vaddr, vsid, ssize); + unsigned long tprot = prot; - tmp_mode = mode; - - /* Make non-kernel text non-executable */ - if (!in_kernel_text(vaddr)) - tmp_mode = mode | HPTE_R_N; + /* + * If we hit a bad address return error. + */ + if (!vsid) + return -1; + /* Make kernel text executable */ + if (overlaps_kernel_text(vaddr, vaddr + step)) + tprot &= ~HPTE_R_N; + + /* Make kvm guest trampolines executable */ + if (overlaps_kvm_tmp(vaddr, vaddr + step)) + tprot &= ~HPTE_R_N; + + /* + * If relocatable, check if it overlaps interrupt vectors that + * are copied down to real 0. For relocatable kernel + * (e.g. kdump case) we copy interrupt vectors down to real + * address 0. Mark that region as executable. This is + * because on p8 system with relocation on exception feature + * enabled, exceptions are raised with MMU (IR=DR=1) ON. Hence + * in order to execute the interrupt handlers in virtual + * mode the vector region need to be marked as executable. + */ + if ((PHYSICAL_START > MEMORY_START) && + overlaps_interrupt_vector_text(vaddr, vaddr + step)) + tprot &= ~HPTE_R_N; - hash = hpt_hash(va, shift, ssize); + hash = hpt_hash(vpn, shift, ssize); hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); - DBG("htab_bolt_mapping: calling %p\n", ppc_md.hpte_insert); - BUG_ON(!ppc_md.hpte_insert); - ret = ppc_md.hpte_insert(hpteg, va, paddr, - tmp_mode, HPTE_V_BOLTED, psize, ssize); + ret = ppc_md.hpte_insert(hpteg, vpn, paddr, tprot, + HPTE_V_BOLTED, psize, psize, ssize); if (ret < 0) break; @@ -191,30 +242,52 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend, return ret < 0 ? ret : 0; } +#ifdef CONFIG_MEMORY_HOTPLUG +static int htab_remove_mapping(unsigned long vstart, unsigned long vend, + int psize, int ssize) +{ + unsigned long vaddr; + unsigned int step, shift; + + shift = mmu_psize_defs[psize].shift; + step = 1 << shift; + + if (!ppc_md.hpte_removebolted) { + printk(KERN_WARNING "Platform doesn't implement " + "hpte_removebolted\n"); + return -EINVAL; + } + + for (vaddr = vstart; vaddr < vend; vaddr += step) + ppc_md.hpte_removebolted(vaddr, psize, ssize); + + return 0; +} +#endif /* CONFIG_MEMORY_HOTPLUG */ + static int __init htab_dt_scan_seg_sizes(unsigned long node, const char *uname, int depth, void *data) { - char *type = of_get_flat_dt_prop(node, "device_type", NULL); - u32 *prop; - unsigned long size = 0; + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + const __be32 *prop; + int size = 0; /* We are scanning "cpu" nodes only */ if (type == NULL || strcmp(type, "cpu") != 0) return 0; - prop = (u32 *)of_get_flat_dt_prop(node, "ibm,processor-segment-sizes", - &size); + prop = of_get_flat_dt_prop(node, "ibm,processor-segment-sizes", &size); if (prop == NULL) return 0; for (; size >= 4; size -= 4, ++prop) { - if (prop[0] == 40) { + if (be32_to_cpu(prop[0]) == 40) { DBG("1T segment support detected\n"); - cur_cpu_spec->cpu_features |= CPU_FTR_1T_SEGMENT; + cur_cpu_spec->mmu_features |= MMU_FTR_1T_SEGMENT; return 1; } } - cur_cpu_spec->cpu_features &= ~CPU_FTR_NO_SLBIE_B; + cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B; return 0; } @@ -223,89 +296,184 @@ static void __init htab_init_seg_sizes(void) of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL); } +static int __init get_idx_from_shift(unsigned int shift) +{ + int idx = -1; + + switch (shift) { + case 0xc: + idx = MMU_PAGE_4K; + break; + case 0x10: + idx = MMU_PAGE_64K; + break; + case 0x14: + idx = MMU_PAGE_1M; + break; + case 0x18: + idx = MMU_PAGE_16M; + break; + case 0x22: + idx = MMU_PAGE_16G; + break; + } + return idx; +} + static int __init htab_dt_scan_page_sizes(unsigned long node, const char *uname, int depth, void *data) { - char *type = of_get_flat_dt_prop(node, "device_type", NULL); - u32 *prop; - unsigned long size = 0; + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + const __be32 *prop; + int size = 0; /* We are scanning "cpu" nodes only */ if (type == NULL || strcmp(type, "cpu") != 0) return 0; - prop = (u32 *)of_get_flat_dt_prop(node, - "ibm,segment-page-sizes", &size); + prop = of_get_flat_dt_prop(node, "ibm,segment-page-sizes", &size); if (prop != NULL) { - DBG("Page sizes from device-tree:\n"); + pr_info("Page sizes from device-tree:\n"); size /= 4; - cur_cpu_spec->cpu_features &= ~(CPU_FTR_16M_PAGE); + cur_cpu_spec->mmu_features &= ~(MMU_FTR_16M_PAGE); while(size > 0) { - unsigned int shift = prop[0]; - unsigned int slbenc = prop[1]; - unsigned int lpnum = prop[2]; - unsigned int lpenc = 0; + unsigned int base_shift = be32_to_cpu(prop[0]); + unsigned int slbenc = be32_to_cpu(prop[1]); + unsigned int lpnum = be32_to_cpu(prop[2]); struct mmu_psize_def *def; - int idx = -1; + int idx, base_idx; size -= 3; prop += 3; - while(size > 0 && lpnum) { - if (prop[0] == shift) - lpenc = prop[1]; - prop += 2; size -= 2; - lpnum--; - } - switch(shift) { - case 0xc: - idx = MMU_PAGE_4K; - break; - case 0x10: - idx = MMU_PAGE_64K; - break; - case 0x14: - idx = MMU_PAGE_1M; - break; - case 0x18: - idx = MMU_PAGE_16M; - cur_cpu_spec->cpu_features |= CPU_FTR_16M_PAGE; - break; - case 0x22: - idx = MMU_PAGE_16G; - break; - } - if (idx < 0) + base_idx = get_idx_from_shift(base_shift); + if (base_idx < 0) { + /* + * skip the pte encoding also + */ + prop += lpnum * 2; size -= lpnum * 2; continue; - def = &mmu_psize_defs[idx]; - def->shift = shift; - if (shift <= 23) + } + def = &mmu_psize_defs[base_idx]; + if (base_idx == MMU_PAGE_16M) + cur_cpu_spec->mmu_features |= MMU_FTR_16M_PAGE; + + def->shift = base_shift; + if (base_shift <= 23) def->avpnm = 0; else - def->avpnm = (1 << (shift - 23)) - 1; + def->avpnm = (1 << (base_shift - 23)) - 1; def->sllp = slbenc; - def->penc = lpenc; - /* We don't know for sure what's up with tlbiel, so + /* + * We don't know for sure what's up with tlbiel, so * for now we only set it for 4K and 64K pages */ - if (idx == MMU_PAGE_4K || idx == MMU_PAGE_64K) + if (base_idx == MMU_PAGE_4K || base_idx == MMU_PAGE_64K) def->tlbiel = 1; else def->tlbiel = 0; - DBG(" %d: shift=%02x, sllp=%04x, avpnm=%08x, " - "tlbiel=%d, penc=%d\n", - idx, shift, def->sllp, def->avpnm, def->tlbiel, - def->penc); + while (size > 0 && lpnum) { + unsigned int shift = be32_to_cpu(prop[0]); + int penc = be32_to_cpu(prop[1]); + + prop += 2; size -= 2; + lpnum--; + + idx = get_idx_from_shift(shift); + if (idx < 0) + continue; + + if (penc == -1) + pr_err("Invalid penc for base_shift=%d " + "shift=%d\n", base_shift, shift); + + def->penc[idx] = penc; + pr_info("base_shift=%d: shift=%d, sllp=0x%04lx," + " avpnm=0x%08lx, tlbiel=%d, penc=%d\n", + base_shift, shift, def->sllp, + def->avpnm, def->tlbiel, def->penc[idx]); + } } return 1; } return 0; } +#ifdef CONFIG_HUGETLB_PAGE +/* Scan for 16G memory blocks that have been set aside for huge pages + * and reserve those blocks for 16G huge pages. + */ +static int __init htab_dt_scan_hugepage_blocks(unsigned long node, + const char *uname, int depth, + void *data) { + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + const __be64 *addr_prop; + const __be32 *page_count_prop; + unsigned int expected_pages; + long unsigned int phys_addr; + long unsigned int block_size; + + /* We are scanning "memory" nodes only */ + if (type == NULL || strcmp(type, "memory") != 0) + return 0; + + /* This property is the log base 2 of the number of virtual pages that + * will represent this memory block. */ + page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL); + if (page_count_prop == NULL) + return 0; + expected_pages = (1 << be32_to_cpu(page_count_prop[0])); + addr_prop = of_get_flat_dt_prop(node, "reg", NULL); + if (addr_prop == NULL) + return 0; + phys_addr = be64_to_cpu(addr_prop[0]); + block_size = be64_to_cpu(addr_prop[1]); + if (block_size != (16 * GB)) + return 0; + printk(KERN_INFO "Huge page(16GB) memory: " + "addr = 0x%lX size = 0x%lX pages = %d\n", + phys_addr, block_size, expected_pages); + if (phys_addr + (16 * GB) <= memblock_end_of_DRAM()) { + memblock_reserve(phys_addr, block_size * expected_pages); + add_gpage(phys_addr, block_size, expected_pages); + } + return 0; +} +#endif /* CONFIG_HUGETLB_PAGE */ + +static void mmu_psize_set_default_penc(void) +{ + int bpsize, apsize; + for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++) + for (apsize = 0; apsize < MMU_PAGE_COUNT; apsize++) + mmu_psize_defs[bpsize].penc[apsize] = -1; +} + +#ifdef CONFIG_PPC_64K_PAGES + +static bool might_have_hea(void) +{ + /* + * The HEA ethernet adapter requires awareness of the + * GX bus. Without that awareness we can easily assume + * we will never see an HEA ethernet device. + */ +#ifdef CONFIG_IBMEBUS + return !cpu_has_feature(CPU_FTR_ARCH_207S); +#else + return false; +#endif +} + +#endif /* #ifdef CONFIG_PPC_64K_PAGES */ + static void __init htab_init_page_sizes(void) { int rc; + /* se the invalid penc to -1 */ + mmu_psize_set_default_penc(); + /* Default to 4K pages only */ memcpy(mmu_psize_defs, mmu_psize_defaults_old, sizeof(mmu_psize_defaults_old)); @@ -321,7 +489,7 @@ static void __init htab_init_page_sizes(void) * Not in the device-tree, let's fallback on known size * list for 16M capable GP & GR */ - if (cpu_has_feature(CPU_FTR_16M_PAGE)) + if (mmu_has_feature(MMU_FTR_16M_PAGE)) memcpy(mmu_psize_defs, mmu_psize_defaults_gp, sizeof(mmu_psize_defaults_gp)); found: @@ -351,34 +519,49 @@ static void __init htab_init_page_sizes(void) mmu_vmalloc_psize = MMU_PAGE_64K; if (mmu_linear_psize == MMU_PAGE_4K) mmu_linear_psize = MMU_PAGE_64K; - if (cpu_has_feature(CPU_FTR_CI_LARGE_PAGE)) { + if (mmu_has_feature(MMU_FTR_CI_LARGE_PAGE)) { /* - * Don't use 64k pages for ioremap on pSeries, since - * that would stop us accessing the HEA ethernet. + * When running on pSeries using 64k pages for ioremap + * would stop us accessing the HEA ethernet. So if we + * have the chance of ever seeing one, stay at 4k. */ - if (!machine_is(pseries)) + if (!might_have_hea() || !machine_is(pseries)) mmu_io_psize = MMU_PAGE_64K; } else mmu_ci_restrictions = 1; } #endif /* CONFIG_PPC_64K_PAGES */ +#ifdef CONFIG_SPARSEMEM_VMEMMAP + /* We try to use 16M pages for vmemmap if that is supported + * and we have at least 1G of RAM at boot + */ + if (mmu_psize_defs[MMU_PAGE_16M].shift && + memblock_phys_mem_size() >= 0x40000000) + mmu_vmemmap_psize = MMU_PAGE_16M; + else if (mmu_psize_defs[MMU_PAGE_64K].shift) + mmu_vmemmap_psize = MMU_PAGE_64K; + else + mmu_vmemmap_psize = MMU_PAGE_4K; +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + printk(KERN_DEBUG "Page orders: linear mapping = %d, " - "virtual = %d, io = %d\n", + "virtual = %d, io = %d" +#ifdef CONFIG_SPARSEMEM_VMEMMAP + ", vmemmap = %d" +#endif + "\n", mmu_psize_defs[mmu_linear_psize].shift, mmu_psize_defs[mmu_virtual_psize].shift, - mmu_psize_defs[mmu_io_psize].shift); + mmu_psize_defs[mmu_io_psize].shift +#ifdef CONFIG_SPARSEMEM_VMEMMAP + ,mmu_psize_defs[mmu_vmemmap_psize].shift +#endif + ); #ifdef CONFIG_HUGETLB_PAGE - /* Init large page size. Currently, we pick 16M or 1M depending - * on what is available - */ - if (mmu_psize_defs[MMU_PAGE_16M].shift) - set_huge_psize(MMU_PAGE_16M); - /* With 4k/4level pagetables, we can't (for now) cope with a - * huge page size < PMD_SIZE */ - else if (mmu_psize_defs[MMU_PAGE_1M].shift) - set_huge_psize(MMU_PAGE_1M); + /* Reserve 16G huge page memory sections for huge pages */ + of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL); #endif /* CONFIG_HUGETLB_PAGE */ } @@ -386,17 +569,17 @@ static int __init htab_dt_scan_pftsize(unsigned long node, const char *uname, int depth, void *data) { - char *type = of_get_flat_dt_prop(node, "device_type", NULL); - u32 *prop; + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + const __be32 *prop; /* We are scanning "cpu" nodes only */ if (type == NULL || strcmp(type, "cpu") != 0) return 0; - prop = (u32 *)of_get_flat_dt_prop(node, "ibm,pft-size", NULL); + prop = of_get_flat_dt_prop(node, "ibm,pft-size", NULL); if (prop != NULL) { /* pft_size[0] is the NUMA CEC cookie */ - ppc64_pft_size = prop[1]; + ppc64_pft_size = be32_to_cpu(prop[1]); return 1; } return 0; @@ -404,7 +587,7 @@ static int __init htab_dt_scan_pftsize(unsigned long node, static unsigned long __init htab_get_table_size(void) { - unsigned long mem_size, rnd_mem_size, pteg_count; + unsigned long mem_size, rnd_mem_size, pteg_count, psize; /* If hash size isn't already provided by the platform, we try to * retrieve it from the device-tree. If it's not there neither, we @@ -416,70 +599,80 @@ static unsigned long __init htab_get_table_size(void) return 1UL << ppc64_pft_size; /* round mem_size up to next power of 2 */ - mem_size = lmb_phys_mem_size(); + mem_size = memblock_phys_mem_size(); rnd_mem_size = 1UL << __ilog2(mem_size); if (rnd_mem_size < mem_size) rnd_mem_size <<= 1; /* # pages / 2 */ - pteg_count = max(rnd_mem_size >> (12 + 1), 1UL << 11); + psize = mmu_psize_defs[mmu_virtual_psize].shift; + pteg_count = max(rnd_mem_size >> (psize + 1), 1UL << 11); return pteg_count << 7; } #ifdef CONFIG_MEMORY_HOTPLUG -void create_section_mapping(unsigned long start, unsigned long end) +int create_section_mapping(unsigned long start, unsigned long end) { - BUG_ON(htab_bolt_mapping(start, end, __pa(start), - _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_COHERENT | PP_RWXX, - mmu_linear_psize, mmu_kernel_ssize)); + return htab_bolt_mapping(start, end, __pa(start), + pgprot_val(PAGE_KERNEL), mmu_linear_psize, + mmu_kernel_ssize); } -#endif /* CONFIG_MEMORY_HOTPLUG */ -static inline void make_bl(unsigned int *insn_addr, void *func) +int remove_section_mapping(unsigned long start, unsigned long end) { - unsigned long funcp = *((unsigned long *)func); - int offset = funcp - (unsigned long)insn_addr; - - *insn_addr = (unsigned int)(0x48000001 | (offset & 0x03fffffc)); - flush_icache_range((unsigned long)insn_addr, 4+ - (unsigned long)insn_addr); + return htab_remove_mapping(start, end, mmu_linear_psize, + mmu_kernel_ssize); } +#endif /* CONFIG_MEMORY_HOTPLUG */ + +extern u32 htab_call_hpte_insert1[]; +extern u32 htab_call_hpte_insert2[]; +extern u32 htab_call_hpte_remove[]; +extern u32 htab_call_hpte_updatepp[]; +extern u32 ht64_call_hpte_insert1[]; +extern u32 ht64_call_hpte_insert2[]; +extern u32 ht64_call_hpte_remove[]; +extern u32 ht64_call_hpte_updatepp[]; static void __init htab_finish_init(void) { - extern unsigned int *htab_call_hpte_insert1; - extern unsigned int *htab_call_hpte_insert2; - extern unsigned int *htab_call_hpte_remove; - extern unsigned int *htab_call_hpte_updatepp; - #ifdef CONFIG_PPC_HAS_HASH_64K - extern unsigned int *ht64_call_hpte_insert1; - extern unsigned int *ht64_call_hpte_insert2; - extern unsigned int *ht64_call_hpte_remove; - extern unsigned int *ht64_call_hpte_updatepp; - - make_bl(ht64_call_hpte_insert1, ppc_md.hpte_insert); - make_bl(ht64_call_hpte_insert2, ppc_md.hpte_insert); - make_bl(ht64_call_hpte_remove, ppc_md.hpte_remove); - make_bl(ht64_call_hpte_updatepp, ppc_md.hpte_updatepp); + patch_branch(ht64_call_hpte_insert1, + ppc_function_entry(ppc_md.hpte_insert), + BRANCH_SET_LINK); + patch_branch(ht64_call_hpte_insert2, + ppc_function_entry(ppc_md.hpte_insert), + BRANCH_SET_LINK); + patch_branch(ht64_call_hpte_remove, + ppc_function_entry(ppc_md.hpte_remove), + BRANCH_SET_LINK); + patch_branch(ht64_call_hpte_updatepp, + ppc_function_entry(ppc_md.hpte_updatepp), + BRANCH_SET_LINK); #endif /* CONFIG_PPC_HAS_HASH_64K */ - make_bl(htab_call_hpte_insert1, ppc_md.hpte_insert); - make_bl(htab_call_hpte_insert2, ppc_md.hpte_insert); - make_bl(htab_call_hpte_remove, ppc_md.hpte_remove); - make_bl(htab_call_hpte_updatepp, ppc_md.hpte_updatepp); + patch_branch(htab_call_hpte_insert1, + ppc_function_entry(ppc_md.hpte_insert), + BRANCH_SET_LINK); + patch_branch(htab_call_hpte_insert2, + ppc_function_entry(ppc_md.hpte_insert), + BRANCH_SET_LINK); + patch_branch(htab_call_hpte_remove, + ppc_function_entry(ppc_md.hpte_remove), + BRANCH_SET_LINK); + patch_branch(htab_call_hpte_updatepp, + ppc_function_entry(ppc_md.hpte_updatepp), + BRANCH_SET_LINK); } -void __init htab_initialize(void) +static void __init htab_initialize(void) { unsigned long table; unsigned long pteg_count; - unsigned long mode_rw; + unsigned long prot; unsigned long base = 0, size = 0, limit; - int i; - - extern unsigned long tce_alloc_start, tce_alloc_end; + struct memblock_region *reg; DBG(" -> htab_initialize()\n"); @@ -489,7 +682,7 @@ void __init htab_initialize(void) /* Initialize page sizes */ htab_init_page_sizes(); - if (cpu_has_feature(CPU_FTR_1T_SEGMENT)) { + if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) { mmu_kernel_ssize = MMU_SEGSIZE_1T; mmu_highuser_ssize = MMU_SEGSIZE_1T; printk(KERN_INFO "Using 1TB segments\n"); @@ -508,6 +701,16 @@ void __init htab_initialize(void) /* Using a hypervisor which owns the htab */ htab_address = NULL; _SDR1 = 0; +#ifdef CONFIG_FA_DUMP + /* + * If firmware assisted dump is active firmware preserves + * the contents of htab along with entire partition memory. + * Clear the htab if firmware assisted dump is active so + * that we dont end up using old mappings. + */ + if (is_fadump_active() && ppc_md.hpte_clear_all) + ppc_md.hpte_clear_all(); +#endif } else { /* Find storage for the HPT. Must be contiguous in * the absolute address space. On cell we want it to be @@ -516,14 +719,14 @@ void __init htab_initialize(void) if (machine_is(cell)) limit = 0x80000000; else - limit = 0; + limit = MEMBLOCK_ALLOC_ANYWHERE; - table = lmb_alloc_base(htab_size_bytes, htab_size_bytes, limit); + table = memblock_alloc_base(htab_size_bytes, htab_size_bytes, limit); DBG("Hash table allocated at %lx, size: %lx\n", table, htab_size_bytes); - htab_address = abs_to_virt(table); + htab_address = __va(table); /* htab absolute addr + encoded htabsize */ _SDR1 = table + __ilog2(pteg_count) - 11; @@ -535,12 +738,12 @@ void __init htab_initialize(void) mtspr(SPRN_SDR1, _SDR1); } - mode_rw = _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_COHERENT | PP_RWXX; + prot = pgprot_val(PAGE_KERNEL); #ifdef CONFIG_DEBUG_PAGEALLOC - linear_map_hash_count = lmb_end_of_DRAM() >> PAGE_SHIFT; - linear_map_hash_slots = __va(lmb_alloc_base(linear_map_hash_count, - 1, lmb.rmo_size)); + linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT; + linear_map_hash_slots = __va(memblock_alloc_base(linear_map_hash_count, + 1, ppc64_rma_size)); memset(linear_map_hash_slots, 0, linear_map_hash_count); #endif /* CONFIG_DEBUG_PAGEALLOC */ @@ -550,15 +753,16 @@ void __init htab_initialize(void) */ /* create bolted the linear mapping in the hash table */ - for (i=0; i < lmb.memory.cnt; i++) { - base = (unsigned long)__va(lmb.memory.region[i].base); - size = lmb.memory.region[i].size; + for_each_memblock(memory, reg) { + base = (unsigned long)__va(reg->base); + size = reg->size; - DBG("creating mapping for region: %lx : %lx\n", base, size); + DBG("creating mapping for region: %lx..%lx (prot: %lx)\n", + base, size, prot); #ifdef CONFIG_U3_DART /* Do not map the DART space. Fortunately, it will be aligned - * in such a way that it will not cross two lmb regions and + * in such a way that it will not cross two memblock regions and * will fit within a single 16Mb page. * The DART space is assumed to be a full 16Mb region even if * we only use 2Mb of that space. We will use more of it later @@ -571,22 +775,23 @@ void __init htab_initialize(void) unsigned long dart_table_end = dart_tablebase + 16 * MB; if (base != dart_tablebase) BUG_ON(htab_bolt_mapping(base, dart_tablebase, - __pa(base), mode_rw, + __pa(base), prot, mmu_linear_psize, mmu_kernel_ssize)); if ((base + size) > dart_table_end) BUG_ON(htab_bolt_mapping(dart_tablebase+16*MB, base + size, __pa(dart_table_end), - mode_rw, + prot, mmu_linear_psize, mmu_kernel_ssize)); continue; } #endif /* CONFIG_U3_DART */ BUG_ON(htab_bolt_mapping(base, base + size, __pa(base), - mode_rw, mmu_linear_psize, mmu_kernel_ssize)); - } + prot, mmu_linear_psize, mmu_kernel_ssize)); + } + memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); /* * If we have a memory_limit and we've allocated TCEs then we need to @@ -603,7 +808,7 @@ void __init htab_initialize(void) tce_alloc_start = base + size + 1; BUG_ON(htab_bolt_mapping(tce_alloc_start, tce_alloc_end, - __pa(tce_alloc_start), mode_rw, + __pa(tce_alloc_start), prot, mmu_linear_psize, mmu_kernel_ssize)); } @@ -614,11 +819,41 @@ void __init htab_initialize(void) #undef KB #undef MB -void htab_initialize_secondary(void) +void __init early_init_mmu(void) { + /* Setup initial STAB address in the PACA */ + get_paca()->stab_real = __pa((u64)&initial_stab); + get_paca()->stab_addr = (u64)&initial_stab; + + /* Initialize the MMU Hash table and create the linear mapping + * of memory. Has to be done before stab/slb initialization as + * this is currently where the page size encoding is obtained + */ + htab_initialize(); + + /* Initialize stab / SLB management */ + if (mmu_has_feature(MMU_FTR_SLB)) + slb_initialize(); + else + stab_initialize(get_paca()->stab_real); +} + +#ifdef CONFIG_SMP +void early_init_mmu_secondary(void) +{ + /* Initialize hash table for that CPU */ if (!firmware_has_feature(FW_FEATURE_LPAR)) mtspr(SPRN_SDR1, _SDR1); + + /* Initialize STAB/SLB. We use a virtual address as it works + * in real mode on pSeries. + */ + if (mmu_has_feature(MMU_FTR_SLB)) + slb_initialize(); + else + stab_initialize(get_paca()->stab_addr); } +#endif /* CONFIG_SMP */ /* * Called by asm hashtable.S for doing lazy icache flush @@ -635,7 +870,7 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) /* page is dirty */ if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) { if (trap == 0x400) { - __flush_dcache_icache(page_address(page)); + flush_dcache_icache_page(page); set_bit(PG_arch_1, &page->flags); } else pp |= HPTE_R_N; @@ -643,6 +878,31 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) return pp; } +#ifdef CONFIG_PPC_MM_SLICES +unsigned int get_paca_psize(unsigned long addr) +{ + u64 lpsizes; + unsigned char *hpsizes; + unsigned long index, mask_index; + + if (addr < SLICE_LOW_TOP) { + lpsizes = get_paca()->context.low_slices_psize; + index = GET_LOW_SLICE_INDEX(addr); + return (lpsizes >> (index * 4)) & 0xF; + } + hpsizes = get_paca()->context.high_slices_psize; + index = GET_HIGH_SLICE_INDEX(addr); + mask_index = index & 0x1; + return (hpsizes[index >> 1] >> (mask_index * 4)) & 0xF; +} + +#else +unsigned int get_paca_psize(unsigned long addr) +{ + return get_paca()->context.user_psize; +} +#endif + /* * Demote a segment to using 4k pages. * For now this makes the whole process use 4k pages. @@ -650,13 +910,13 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) #ifdef CONFIG_PPC_64K_PAGES void demote_segment_4k(struct mm_struct *mm, unsigned long addr) { - if (mm->context.user_psize == MMU_PAGE_4K) + if (get_slice_psize(mm, addr) == MMU_PAGE_4K) return; - slice_set_user_psize(mm, MMU_PAGE_4K); + slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K); #ifdef CONFIG_SPU_BASE spu_flush_all_slbs(mm); #endif - if (get_paca()->context.user_psize != MMU_PAGE_4K) { + if (get_paca_psize(addr) != MMU_PAGE_4K) { get_paca()->context = mm->context; slb_flush_and_rebolt(); } @@ -671,15 +931,15 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr) * Result is 0: full permissions, _PAGE_RW: read-only, * _PAGE_USER or _PAGE_USER|_PAGE_RW: no access. */ -static int subpage_protection(pgd_t *pgdir, unsigned long ea) +static int subpage_protection(struct mm_struct *mm, unsigned long ea) { - struct subpage_prot_table *spt = pgd_subpage_prot(pgdir); + struct subpage_prot_table *spt = &mm->context.spt; u32 spp = 0; u32 **sbpm, *sbpp; if (ea >= spt->maxaddr) return 0; - if (ea < 0x100000000) { + if (ea < 0x100000000UL) { /* addresses below 4GB use spt->low_prot */ sbpm = spt->low_prot; } else { @@ -701,12 +961,40 @@ static int subpage_protection(pgd_t *pgdir, unsigned long ea) } #else /* CONFIG_PPC_SUBPAGE_PROT */ -static inline int subpage_protection(pgd_t *pgdir, unsigned long ea) +static inline int subpage_protection(struct mm_struct *mm, unsigned long ea) { return 0; } #endif +void hash_failure_debug(unsigned long ea, unsigned long access, + unsigned long vsid, unsigned long trap, + int ssize, int psize, int lpsize, unsigned long pte) +{ + if (!printk_ratelimit()) + return; + pr_info("mm: Hashing failure ! EA=0x%lx access=0x%lx current=%s\n", + ea, access, current->comm); + pr_info(" trap=0x%lx vsid=0x%lx ssize=%d base psize=%d psize %d pte=0x%lx\n", + trap, vsid, ssize, psize, lpsize, pte); +} + +static void check_paca_psize(unsigned long ea, struct mm_struct *mm, + int psize, bool user_region) +{ + if (user_region) { + if (psize != get_paca_psize(ea)) { + get_paca()->context = mm->context; + slb_flush_and_rebolt(); + } + } else if (get_paca()->vmalloc_sllp != + mmu_psize_defs[mmu_vmalloc_psize].sllp) { + get_paca()->vmalloc_sllp = + mmu_psize_defs[mmu_vmalloc_psize].sllp; + slb_vmalloc_update(); + } +} + /* Result code is: * 0 - handled * 1 - normal page fault @@ -715,22 +1003,19 @@ static inline int subpage_protection(pgd_t *pgdir, unsigned long ea) */ int hash_page(unsigned long ea, unsigned long access, unsigned long trap) { - void *pgdir; + enum ctx_state prev_state = exception_enter(); + pgd_t *pgdir; unsigned long vsid; struct mm_struct *mm; pte_t *ptep; - cpumask_t tmp; + unsigned hugeshift; + const struct cpumask *tmp; int rc, user_region = 0, local = 0; int psize, ssize; DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n", ea, access, trap); - if ((ea & ~REGION_MASK) >= PGTABLE_RANGE) { - DBG_LOW(" out of pgtable range !\n"); - return 1; - } - /* Get region & vsid */ switch (REGION_ID(ea)) { case USER_REGION_ID: @@ -738,13 +1023,10 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap) mm = current->mm; if (! mm) { DBG_LOW(" user region with no mm !\n"); - return 1; + rc = 1; + goto bail; } -#ifdef CONFIG_PPC_MM_SLICES psize = get_slice_psize(mm, ea); -#else - psize = mm->context.user_psize; -#endif ssize = user_segment_size(ea); vsid = get_vsid(mm->context.id, ea, ssize); break; @@ -761,62 +1043,94 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap) /* Not a valid range * Send the problem up to do_page_fault */ - return 1; + rc = 1; + goto bail; } DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid); + /* Bad address. */ + if (!vsid) { + DBG_LOW("Bad address!\n"); + rc = 1; + goto bail; + } /* Get pgdir */ pgdir = mm->pgd; - if (pgdir == NULL) - return 1; + if (pgdir == NULL) { + rc = 1; + goto bail; + } /* Check CPU locality */ - tmp = cpumask_of_cpu(smp_processor_id()); - if (user_region && cpus_equal(mm->cpu_vm_mask, tmp)) + tmp = cpumask_of(smp_processor_id()); + if (user_region && cpumask_equal(mm_cpumask(mm), tmp)) local = 1; -#ifdef CONFIG_HUGETLB_PAGE - /* Handle hugepage regions */ - if (HPAGE_SHIFT && psize == mmu_huge_psize) { - DBG_LOW(" -> huge page !\n"); - return hash_huge_page(mm, access, ea, vsid, local, trap); - } -#endif /* CONFIG_HUGETLB_PAGE */ - #ifndef CONFIG_PPC_64K_PAGES - /* If we use 4K pages and our psize is not 4K, then we are hitting - * a special driver mapping, we need to align the address before - * we fetch the PTE + /* If we use 4K pages and our psize is not 4K, then we might + * be hitting a special driver mapping, and need to align the + * address before we fetch the PTE. + * + * It could also be a hugepage mapping, in which case this is + * not necessary, but it's not harmful, either. */ if (psize != MMU_PAGE_4K) ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1); #endif /* CONFIG_PPC_64K_PAGES */ /* Get PTE and page size from page tables */ - ptep = find_linux_pte(pgdir, ea); + ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugeshift); if (ptep == NULL || !pte_present(*ptep)) { DBG_LOW(" no PTE !\n"); - return 1; + rc = 1; + goto bail; } -#ifndef CONFIG_PPC_64K_PAGES - DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep)); -#else - DBG_LOW(" i-pte: %016lx %016lx\n", pte_val(*ptep), - pte_val(*(ptep + PTRS_PER_PTE))); -#endif + /* Add _PAGE_PRESENT to the required access perm */ + access |= _PAGE_PRESENT; + /* Pre-check access permissions (will be re-checked atomically * in __hash_page_XX but this pre-check is a fast path */ if (access & ~pte_val(*ptep)) { DBG_LOW(" no access !\n"); - return 1; + rc = 1; + goto bail; } + if (hugeshift) { + if (pmd_trans_huge(*(pmd_t *)ptep)) + rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep, + trap, local, ssize, psize); +#ifdef CONFIG_HUGETLB_PAGE + else + rc = __hash_page_huge(ea, access, vsid, ptep, trap, + local, ssize, hugeshift, psize); +#else + else { + /* + * if we have hugeshift, and is not transhuge with + * hugetlb disabled, something is really wrong. + */ + rc = 1; + WARN_ON(1); + } +#endif + check_paca_psize(ea, mm, psize, user_region); + + goto bail; + } + +#ifndef CONFIG_PPC_64K_PAGES + DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep)); +#else + DBG_LOW(" i-pte: %016lx %016lx\n", pte_val(*ptep), + pte_val(*(ptep + PTRS_PER_PTE))); +#endif /* Do actual hashing */ #ifdef CONFIG_PPC_64K_PAGES /* If _PAGE_4K_PFN is set, make sure this is a 4k segment */ - if (pte_val(*ptep) & _PAGE_4K_PFN) { + if ((pte_val(*ptep) & _PAGE_4K_PFN) && psize == MMU_PAGE_64K) { demote_segment_4k(mm, ea); psize = MMU_PAGE_4K; } @@ -844,17 +1158,8 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap) #endif } } - if (user_region) { - if (psize != get_paca()->context.user_psize) { - get_paca()->context = mm->context; - slb_flush_and_rebolt(); - } - } else if (get_paca()->vmalloc_sllp != - mmu_psize_defs[mmu_vmalloc_psize].sllp) { - get_paca()->vmalloc_sllp = - mmu_psize_defs[mmu_vmalloc_psize].sllp; - slb_vmalloc_update(); - } + + check_paca_psize(ea, mm, psize, user_region); #endif /* CONFIG_PPC_64K_PAGES */ #ifdef CONFIG_PPC_HAS_HASH_64K @@ -863,7 +1168,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap) else #endif /* CONFIG_PPC_HAS_HASH_64K */ { - int spp = subpage_protection(pgdir, ea); + int spp = subpage_protection(mm, ea); if (access & spp) rc = -2; else @@ -871,6 +1176,12 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap) local, ssize, spp); } + /* Dump some info in case of hash insertion failure, they should + * never happen so it is really useful to know if/when they do + */ + if (rc == -1) + hash_failure_debug(ea, access, vsid, trap, ssize, psize, + psize, pte_val(*ptep)); #ifndef CONFIG_PPC_64K_PAGES DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep)); #else @@ -878,6 +1189,9 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap) pte_val(*(ptep + PTRS_PER_PTE))); #endif DBG_LOW(" -> rc=%d\n", rc); + +bail: + exception_exit(prev_state); return rc; } EXPORT_SYMBOL_GPL(hash_page); @@ -885,13 +1199,12 @@ EXPORT_SYMBOL_GPL(hash_page); void hash_preload(struct mm_struct *mm, unsigned long ea, unsigned long access, unsigned long trap) { + int hugepage_shift; unsigned long vsid; - void *pgdir; + pgd_t *pgdir; pte_t *ptep; - cpumask_t mask; unsigned long flags; - int local = 0; - int ssize; + int rc, ssize, local = 0; BUG_ON(REGION_ID(ea) != USER_REGION_ID); @@ -908,10 +1221,27 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, pgdir = mm->pgd; if (pgdir == NULL) return; - ptep = find_linux_pte(pgdir, ea); - if (!ptep) + + /* Get VSID */ + ssize = user_segment_size(ea); + vsid = get_vsid(mm->context.id, ea, ssize); + if (!vsid) return; + /* + * Hash doesn't like irqs. Walking linux page table with irq disabled + * saves us from holding multiple locks. + */ + local_irq_save(flags); + /* + * THP pages use update_mmu_cache_pmd. We don't do + * hash preload there. Hence can ignore THP here + */ + ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugepage_shift); + if (!ptep) + goto out_exit; + + WARN_ON(hugepage_shift); #ifdef CONFIG_PPC_64K_PAGES /* If either _PAGE_4K_PFN or _PAGE_NO_CACHE is set (and we are on * a 64K kernel), then we don't preload, hash_page() will take @@ -920,52 +1250,73 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, * page size demotion here */ if (pte_val(*ptep) & (_PAGE_4K_PFN | _PAGE_NO_CACHE)) - return; + goto out_exit; #endif /* CONFIG_PPC_64K_PAGES */ - /* Get VSID */ - ssize = user_segment_size(ea); - vsid = get_vsid(mm->context.id, ea, ssize); - - /* Hash doesn't like irqs */ - local_irq_save(flags); - /* Is that local to this CPU ? */ - mask = cpumask_of_cpu(smp_processor_id()); - if (cpus_equal(mm->cpu_vm_mask, mask)) + if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) local = 1; /* Hash it in */ #ifdef CONFIG_PPC_HAS_HASH_64K if (mm->context.user_psize == MMU_PAGE_64K) - __hash_page_64K(ea, access, vsid, ptep, trap, local, ssize); + rc = __hash_page_64K(ea, access, vsid, ptep, trap, local, ssize); else #endif /* CONFIG_PPC_HAS_HASH_64K */ - __hash_page_4K(ea, access, vsid, ptep, trap, local, ssize, - subpage_protection(pgdir, ea)); + rc = __hash_page_4K(ea, access, vsid, ptep, trap, local, ssize, + subpage_protection(mm, ea)); + /* Dump some info in case of hash insertion failure, they should + * never happen so it is really useful to know if/when they do + */ + if (rc == -1) + hash_failure_debug(ea, access, vsid, trap, ssize, + mm->context.user_psize, + mm->context.user_psize, + pte_val(*ptep)); +out_exit: local_irq_restore(flags); } /* WARNING: This is called from hash_low_64.S, if you change this prototype, * do not forget to update the assembly call site ! */ -void flush_hash_page(unsigned long va, real_pte_t pte, int psize, int ssize, +void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize, int local) { unsigned long hash, index, shift, hidx, slot; - DBG_LOW("flush_hash_page(va=%016x)\n", va); - pte_iterate_hashed_subpages(pte, psize, va, index, shift) { - hash = hpt_hash(va, shift, ssize); + DBG_LOW("flush_hash_page(vpn=%016lx)\n", vpn); + pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) { + hash = hpt_hash(vpn, shift, ssize); hidx = __rpte_to_hidx(pte, index); if (hidx & _PTEIDX_SECONDARY) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; slot += hidx & _PTEIDX_GROUP_IX; - DBG_LOW(" sub %d: hash=%x, hidx=%x\n", index, slot, hidx); - ppc_md.hpte_invalidate(slot, va, psize, ssize, local); + DBG_LOW(" sub %ld: hash=%lx, hidx=%lx\n", index, slot, hidx); + /* + * We use same base page size and actual psize, because we don't + * use these functions for hugepage + */ + ppc_md.hpte_invalidate(slot, vpn, psize, psize, ssize, local); } pte_iterate_hashed_end(); + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + /* Transactions are not aborted by tlbiel, only tlbie. + * Without, syncing a page back to a block device w/ PIO could pick up + * transactional data (bad!) so we force an abort here. Before the + * sync the page will be made read-only, which will flush_hash_page. + * BIG ISSUE here: if the kernel uses a page from userspace without + * unmapping it first, it may see the speculated version. + */ + if (local && cpu_has_feature(CPU_FTR_TM) && + current->thread.regs && + MSR_TM_ACTIVE(current->thread.regs->msr)) { + tm_enable(); + tm_abort(TM_CAUSE_TLBI); + } +#endif } void flush_hash_range(unsigned long number, int local) @@ -978,7 +1329,7 @@ void flush_hash_range(unsigned long number, int local) &__get_cpu_var(ppc64_tlb_batch); for (i = 0; i < number; i++) - flush_hash_page(batch->vaddr[i], batch->pte[i], + flush_hash_page(batch->vpn[i], batch->pte[i], batch->psize, batch->ssize, local); } } @@ -989,6 +1340,8 @@ void flush_hash_range(unsigned long number, int local) */ void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc) { + enum ctx_state prev_state = exception_enter(); + if (user_mode(regs)) { #ifdef CONFIG_PPC_SUBPAGE_PROT if (rc == -2) @@ -998,24 +1351,64 @@ void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc) _exception(SIGBUS, regs, BUS_ADRERR, address); } else bad_page_fault(regs, address, SIGBUS); + + exception_exit(prev_state); +} + +long hpte_insert_repeating(unsigned long hash, unsigned long vpn, + unsigned long pa, unsigned long rflags, + unsigned long vflags, int psize, int ssize) +{ + unsigned long hpte_group; + long slot; + +repeat: + hpte_group = ((hash & htab_hash_mask) * + HPTES_PER_GROUP) & ~0x7UL; + + /* Insert into the hash table, primary slot */ + slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, vflags, + psize, psize, ssize); + + /* Primary is full, try the secondary */ + if (unlikely(slot == -1)) { + hpte_group = ((~hash & htab_hash_mask) * + HPTES_PER_GROUP) & ~0x7UL; + slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, + vflags | HPTE_V_SECONDARY, + psize, psize, ssize); + if (slot == -1) { + if (mftb() & 0x1) + hpte_group = ((hash & htab_hash_mask) * + HPTES_PER_GROUP)&~0x7UL; + + ppc_md.hpte_remove(hpte_group); + goto repeat; + } + } + + return slot; } #ifdef CONFIG_DEBUG_PAGEALLOC static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) { - unsigned long hash, hpteg; + unsigned long hash; unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); - unsigned long va = hpt_va(vaddr, vsid, mmu_kernel_ssize); - unsigned long mode = _PAGE_ACCESSED | _PAGE_DIRTY | - _PAGE_COHERENT | PP_RWXX | HPTE_R_N; - int ret; + unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); + unsigned long mode = htab_convert_pte_flags(PAGE_KERNEL); + long ret; + + hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); + + /* Don't create HPTE entries for bad address */ + if (!vsid) + return; - hash = hpt_hash(va, PAGE_SHIFT, mmu_kernel_ssize); - hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); + ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode, + HPTE_V_BOLTED, + mmu_linear_psize, mmu_kernel_ssize); - ret = ppc_md.hpte_insert(hpteg, va, __pa(vaddr), - mode, HPTE_V_BOLTED, - mmu_linear_psize, mmu_kernel_ssize); BUG_ON (ret < 0); spin_lock(&linear_map_hash_lock); BUG_ON(linear_map_hash_slots[lmi] & 0x80); @@ -1027,9 +1420,9 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) { unsigned long hash, hidx, slot; unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); - unsigned long va = hpt_va(vaddr, vsid, mmu_kernel_ssize); + unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); - hash = hpt_hash(va, PAGE_SHIFT, mmu_kernel_ssize); + hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); spin_lock(&linear_map_hash_lock); BUG_ON(!(linear_map_hash_slots[lmi] & 0x80)); hidx = linear_map_hash_slots[lmi] & 0x7f; @@ -1039,7 +1432,8 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; slot += hidx & _PTEIDX_GROUP_IX; - ppc_md.hpte_invalidate(slot, va, mmu_linear_psize, mmu_kernel_ssize, 0); + ppc_md.hpte_invalidate(slot, vpn, mmu_linear_psize, mmu_linear_psize, + mmu_kernel_ssize, 0); } void kernel_map_pages(struct page *page, int numpages, int enable) @@ -1061,3 +1455,23 @@ void kernel_map_pages(struct page *page, int numpages, int enable) local_irq_restore(flags); } #endif /* CONFIG_DEBUG_PAGEALLOC */ + +void setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + /* We don't currently support the first MEMBLOCK not mapping 0 + * physical on those processors + */ + BUG_ON(first_memblock_base != 0); + + /* On LPAR systems, the first entry is our RMA region, + * non-LPAR 64-bit hash MMU systems don't have a limitation + * on real mode access, but using the first entry works well + * enough. We also clamp it to 1G to avoid some funky things + * such as RTAS bugs etc... + */ + ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000); + + /* Finally limit subsequent allocations */ + memblock_set_current_limit(ppc64_rma_size); +} diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c new file mode 100644 index 00000000000..e7450bdbe83 --- /dev/null +++ b/arch/powerpc/mm/highmem.c @@ -0,0 +1,86 @@ +/* + * highmem.c: virtual kernel memory mappings for high memory + * + * PowerPC version, stolen from the i386 version. + * + * Used in CONFIG_HIGHMEM systems for memory pages which + * are not addressable by direct kernel virtual addresses. + * + * Copyright (C) 1999 Gerhard Wichert, Siemens AG + * Gerhard.Wichert@pdb.siemens.de + * + * + * Redesigned the x86 32-bit VM architecture to deal with + * up to 16 Terrabyte physical memory. With current x86 CPUs + * we now support up to 64 Gigabytes physical RAM. + * + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> + * + * Reworked for PowerPC by various contributors. Moved from + * highmem.h by Benjamin Herrenschmidt (c) 2009 IBM Corp. + */ + +#include <linux/highmem.h> +#include <linux/module.h> + +/* + * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap + * gives a more generic (and caching) interface. But kmap_atomic can + * be used in IRQ contexts, so in some (very limited) cases we need + * it. + */ +void *kmap_atomic_prot(struct page *page, pgprot_t prot) +{ + unsigned long vaddr; + int idx, type; + + /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ + pagefault_disable(); + if (!PageHighMem(page)) + return page_address(page); + + type = kmap_atomic_idx_push(); + idx = type + KM_TYPE_NR*smp_processor_id(); + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +#ifdef CONFIG_DEBUG_HIGHMEM + BUG_ON(!pte_none(*(kmap_pte-idx))); +#endif + __set_pte_at(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot), 1); + local_flush_tlb_page(NULL, vaddr); + + return (void*) vaddr; +} +EXPORT_SYMBOL(kmap_atomic_prot); + +void __kunmap_atomic(void *kvaddr) +{ + unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; + int type; + + if (vaddr < __fix_to_virt(FIX_KMAP_END)) { + pagefault_enable(); + return; + } + + type = kmap_atomic_idx(); + +#ifdef CONFIG_DEBUG_HIGHMEM + { + unsigned int idx; + + idx = type + KM_TYPE_NR * smp_processor_id(); + BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); + + /* + * force other mappings to Oops if they'll try to access + * this pte without first remap it + */ + pte_clear(&init_mm, vaddr, kmap_pte-idx); + local_flush_tlb_page(NULL, vaddr); + } +#endif + + kmap_atomic_idx_pop(); + pagefault_enable(); +} +EXPORT_SYMBOL(__kunmap_atomic); diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c new file mode 100644 index 00000000000..826893fcb3a --- /dev/null +++ b/arch/powerpc/mm/hugepage-hash64.c @@ -0,0 +1,179 @@ +/* + * Copyright IBM Corporation, 2013 + * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + +/* + * PPC64 THP Support for hash based MMUs + */ +#include <linux/mm.h> +#include <asm/machdep.h> + +int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, + pmd_t *pmdp, unsigned long trap, int local, int ssize, + unsigned int psize) +{ + unsigned int index, valid; + unsigned char *hpte_slot_array; + unsigned long rflags, pa, hidx; + unsigned long old_pmd, new_pmd; + int ret, lpsize = MMU_PAGE_16M; + unsigned long vpn, hash, shift, slot; + + /* + * atomically mark the linux large page PMD busy and dirty + */ + do { + old_pmd = pmd_val(*pmdp); + /* If PMD busy, retry the access */ + if (unlikely(old_pmd & _PAGE_BUSY)) + return 0; + /* If PMD is trans splitting retry the access */ + if (unlikely(old_pmd & _PAGE_SPLITTING)) + return 0; + /* If PMD permissions don't match, take page fault */ + if (unlikely(access & ~old_pmd)) + return 1; + /* + * Try to lock the PTE, add ACCESSED and DIRTY if it was + * a write access + */ + new_pmd = old_pmd | _PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_RW) + new_pmd |= _PAGE_DIRTY; + } while (old_pmd != __cmpxchg_u64((unsigned long *)pmdp, + old_pmd, new_pmd)); + /* + * PP bits. _PAGE_USER is already PP bit 0x2, so we only + * need to add in 0x1 if it's a read-only user page + */ + rflags = new_pmd & _PAGE_USER; + if ((new_pmd & _PAGE_USER) && !((new_pmd & _PAGE_RW) && + (new_pmd & _PAGE_DIRTY))) + rflags |= 0x1; + /* + * _PAGE_EXEC -> HW_NO_EXEC since it's inverted + */ + rflags |= ((new_pmd & _PAGE_EXEC) ? 0 : HPTE_R_N); + +#if 0 + if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) { + + /* + * No CPU has hugepages but lacks no execute, so we + * don't need to worry about that case + */ + rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); + } +#endif + /* + * Find the slot index details for this ea, using base page size. + */ + shift = mmu_psize_defs[psize].shift; + index = (ea & ~HPAGE_PMD_MASK) >> shift; + BUG_ON(index >= 4096); + + vpn = hpt_vpn(ea, vsid, ssize); + hash = hpt_hash(vpn, shift, ssize); + hpte_slot_array = get_hpte_slot_array(pmdp); + + valid = hpte_valid(hpte_slot_array, index); + if (valid) { + /* update the hpte bits */ + hidx = hpte_hash_index(hpte_slot_array, index); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + + ret = ppc_md.hpte_updatepp(slot, rflags, vpn, + psize, lpsize, ssize, local); + /* + * We failed to update, try to insert a new entry. + */ + if (ret == -1) { + /* + * large pte is marked busy, so we can be sure + * nobody is looking at hpte_slot_array. hence we can + * safely update this here. + */ + valid = 0; + new_pmd &= ~_PAGE_HPTEFLAGS; + hpte_slot_array[index] = 0; + } else + /* clear the busy bits and set the hash pte bits */ + new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; + } + + if (!valid) { + unsigned long hpte_group; + + /* insert new entry */ + pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT; +repeat: + hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; + + /* clear the busy bits and set the hash pte bits */ + new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; + + /* Add in WIMG bits */ + rflags |= (new_pmd & (_PAGE_WRITETHRU | _PAGE_NO_CACHE | + _PAGE_GUARDED)); + /* + * enable the memory coherence always + */ + rflags |= HPTE_R_M; + + /* Insert into the hash table, primary slot */ + slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0, + psize, lpsize, ssize); + /* + * Primary is full, try the secondary + */ + if (unlikely(slot == -1)) { + hpte_group = ((~hash & htab_hash_mask) * + HPTES_PER_GROUP) & ~0x7UL; + slot = ppc_md.hpte_insert(hpte_group, vpn, pa, + rflags, HPTE_V_SECONDARY, + psize, lpsize, ssize); + if (slot == -1) { + if (mftb() & 0x1) + hpte_group = ((hash & htab_hash_mask) * + HPTES_PER_GROUP) & ~0x7UL; + + ppc_md.hpte_remove(hpte_group); + goto repeat; + } + } + /* + * Hypervisor failure. Restore old pmd and return -1 + * similar to __hash_page_* + */ + if (unlikely(slot == -2)) { + *pmdp = __pmd(old_pmd); + hash_failure_debug(ea, access, vsid, trap, ssize, + psize, lpsize, old_pmd); + return -1; + } + /* + * large pte is marked busy, so we can be sure + * nobody is looking at hpte_slot_array. hence we can + * safely update this here. + */ + mark_hpte_slot_valid(hpte_slot_array, index, slot); + } + /* + * No need to use ldarx/stdcx here + */ + *pmdp = __pmd(new_pmd & ~_PAGE_BUSY); + return 0; +} diff --git a/arch/powerpc/mm/hugetlbpage-book3e.c b/arch/powerpc/mm/hugetlbpage-book3e.c new file mode 100644 index 00000000000..5e4ee257390 --- /dev/null +++ b/arch/powerpc/mm/hugetlbpage-book3e.c @@ -0,0 +1,153 @@ +/* + * PPC Huge TLB Page Support for Book3E MMU + * + * Copyright (C) 2009 David Gibson, IBM Corporation. + * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor + * + */ +#include <linux/mm.h> +#include <linux/hugetlb.h> + +#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC64 +static inline int tlb1_next(void) +{ + struct paca_struct *paca = get_paca(); + struct tlb_core_data *tcd; + int this, next; + + tcd = paca->tcd_ptr; + this = tcd->esel_next; + + next = this + 1; + if (next >= tcd->esel_max) + next = tcd->esel_first; + + tcd->esel_next = next; + return this; +} +#else +static inline int tlb1_next(void) +{ + int index, ncams; + + ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; + + index = __get_cpu_var(next_tlbcam_idx); + + /* Just round-robin the entries and wrap when we hit the end */ + if (unlikely(index == ncams - 1)) + __get_cpu_var(next_tlbcam_idx) = tlbcam_index; + else + __get_cpu_var(next_tlbcam_idx)++; + + return index; +} +#endif /* !PPC64 */ +#endif /* FSL */ + +static inline int mmu_get_tsize(int psize) +{ + return mmu_psize_defs[psize].enc; +} + +static inline int book3e_tlb_exists(unsigned long ea, unsigned long pid) +{ + int found = 0; + + mtspr(SPRN_MAS6, pid << 16); + if (mmu_has_feature(MMU_FTR_USE_TLBRSRV)) { + asm volatile( + "li %0,0\n" + "tlbsx. 0,%1\n" + "bne 1f\n" + "li %0,1\n" + "1:\n" + : "=&r"(found) : "r"(ea)); + } else { + asm volatile( + "tlbsx 0,%1\n" + "mfspr %0,0x271\n" + "srwi %0,%0,31\n" + : "=&r"(found) : "r"(ea)); + } + + return found; +} + +void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, + pte_t pte) +{ + unsigned long mas1, mas2; + u64 mas7_3; + unsigned long psize, tsize, shift; + unsigned long flags; + struct mm_struct *mm; + +#ifdef CONFIG_PPC_FSL_BOOK3E + int index; +#endif + + if (unlikely(is_kernel_addr(ea))) + return; + + mm = vma->vm_mm; + +#ifdef CONFIG_PPC_MM_SLICES + psize = get_slice_psize(mm, ea); + tsize = mmu_get_tsize(psize); + shift = mmu_psize_defs[psize].shift; +#else + psize = vma_mmu_pagesize(vma); + shift = __ilog2(psize); + tsize = shift - 10; +#endif + + /* + * We can't be interrupted while we're setting up the MAS + * regusters or after we've confirmed that no tlb exists. + */ + local_irq_save(flags); + + if (unlikely(book3e_tlb_exists(ea, mm->context.id))) { + local_irq_restore(flags); + return; + } + +#ifdef CONFIG_PPC_FSL_BOOK3E + /* We have to use the CAM(TLB1) on FSL parts for hugepages */ + index = tlb1_next(); + mtspr(SPRN_MAS0, MAS0_ESEL(index) | MAS0_TLBSEL(1)); +#endif + + mas1 = MAS1_VALID | MAS1_TID(mm->context.id) | MAS1_TSIZE(tsize); + mas2 = ea & ~((1UL << shift) - 1); + mas2 |= (pte_val(pte) >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK; + mas7_3 = (u64)pte_pfn(pte) << PAGE_SHIFT; + mas7_3 |= (pte_val(pte) >> PTE_BAP_SHIFT) & MAS3_BAP_MASK; + if (!pte_dirty(pte)) + mas7_3 &= ~(MAS3_SW|MAS3_UW); + + mtspr(SPRN_MAS1, mas1); + mtspr(SPRN_MAS2, mas2); + + if (mmu_has_feature(MMU_FTR_USE_PAIRED_MAS)) { + mtspr(SPRN_MAS7_MAS3, mas7_3); + } else { + if (mmu_has_feature(MMU_FTR_BIG_PHYS)) + mtspr(SPRN_MAS7, upper_32_bits(mas7_3)); + mtspr(SPRN_MAS3, lower_32_bits(mas7_3)); + } + + asm volatile ("tlbwe"); + + local_irq_restore(flags); +} + +void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + struct hstate *hstate = hstate_file(vma->vm_file); + unsigned long tsize = huge_page_shift(hstate) - 10; + + __flush_tlb_page(vma->vm_mm, vmaddr, tsize, 0); +} diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c new file mode 100644 index 00000000000..a5bcf930119 --- /dev/null +++ b/arch/powerpc/mm/hugetlbpage-hash64.c @@ -0,0 +1,129 @@ +/* + * PPC64 Huge TLB Page Support for hash based MMUs (POWER4 and later) + * + * Copyright (C) 2003 David Gibson, IBM Corporation. + * + * Based on the IA-32 version: + * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> + */ + +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/cacheflush.h> +#include <asm/machdep.h> + +extern long hpte_insert_repeating(unsigned long hash, unsigned long vpn, + unsigned long pa, unsigned long rlags, + unsigned long vflags, int psize, int ssize); + +int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, + pte_t *ptep, unsigned long trap, int local, int ssize, + unsigned int shift, unsigned int mmu_psize) +{ + unsigned long vpn; + unsigned long old_pte, new_pte; + unsigned long rflags, pa, sz; + long slot; + + BUG_ON(shift != mmu_psize_defs[mmu_psize].shift); + + /* Search the Linux page table for a match with va */ + vpn = hpt_vpn(ea, vsid, ssize); + + /* At this point, we have a pte (old_pte) which can be used to build + * or update an HPTE. There are 2 cases: + * + * 1. There is a valid (present) pte with no associated HPTE (this is + * the most common case) + * 2. There is a valid (present) pte with an associated HPTE. The + * current values of the pp bits in the HPTE prevent access + * because we are doing software DIRTY bit management and the + * page is currently not DIRTY. + */ + + + do { + old_pte = pte_val(*ptep); + /* If PTE busy, retry the access */ + if (unlikely(old_pte & _PAGE_BUSY)) + return 0; + /* If PTE permissions don't match, take page fault */ + if (unlikely(access & ~old_pte)) + return 1; + /* Try to lock the PTE, add ACCESSED and DIRTY if it was + * a write access */ + new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_RW) + new_pte |= _PAGE_DIRTY; + } while(old_pte != __cmpxchg_u64((unsigned long *)ptep, + old_pte, new_pte)); + + rflags = 0x2 | (!(new_pte & _PAGE_RW)); + /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */ + rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N); + sz = ((1UL) << shift); + if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) + /* No CPU has hugepages but lacks no execute, so we + * don't need to worry about that case */ + rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); + + /* Check if pte already has an hpte (case 2) */ + if (unlikely(old_pte & _PAGE_HASHPTE)) { + /* There MIGHT be an HPTE for this pte */ + unsigned long hash, slot; + + hash = hpt_hash(vpn, shift, ssize); + if (old_pte & _PAGE_F_SECOND) + hash = ~hash; + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += (old_pte & _PAGE_F_GIX) >> 12; + + if (ppc_md.hpte_updatepp(slot, rflags, vpn, mmu_psize, + mmu_psize, ssize, local) == -1) + old_pte &= ~_PAGE_HPTEFLAGS; + } + + if (likely(!(old_pte & _PAGE_HASHPTE))) { + unsigned long hash = hpt_hash(vpn, shift, ssize); + + pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; + + /* clear HPTE slot informations in new PTE */ +#ifdef CONFIG_PPC_64K_PAGES + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0; +#else + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; +#endif + /* Add in WIMG bits */ + rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE | + _PAGE_COHERENT | _PAGE_GUARDED)); + /* + * enable the memory coherence always + */ + rflags |= HPTE_R_M; + + slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0, + mmu_psize, ssize); + + /* + * Hypervisor failure. Restore old pte and return -1 + * similar to __hash_page_* + */ + if (unlikely(slot == -2)) { + *ptep = __pte(old_pte); + hash_failure_debug(ea, access, vsid, trap, ssize, + mmu_psize, mmu_psize, old_pte); + return -1; + } + + new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX); + } + + /* + * No need to use ldarx/stdcx here + */ + *ptep = __pte(new_pte & ~_PAGE_BUSY); + return 0; +} diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index a02266dad21..7e70ae968e5 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -1,175 +1,529 @@ /* - * PPC64 (POWER4) Huge TLB Page Support for Kernel. + * PPC Huge TLB Page Support for Kernel. * * Copyright (C) 2003 David Gibson, IBM Corporation. + * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor * * Based on the IA-32 version: * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> */ -#include <linux/init.h> -#include <linux/fs.h> #include <linux/mm.h> -#include <linux/hugetlb.h> -#include <linux/pagemap.h> +#include <linux/io.h> #include <linux/slab.h> -#include <linux/err.h> -#include <linux/sysctl.h> -#include <asm/mman.h> +#include <linux/hugetlb.h> +#include <linux/export.h> +#include <linux/of_fdt.h> +#include <linux/memblock.h> +#include <linux/bootmem.h> +#include <linux/moduleparam.h> +#include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/tlb.h> -#include <asm/tlbflush.h> -#include <asm/mmu_context.h> -#include <asm/machdep.h> -#include <asm/cputable.h> -#include <asm/spu.h> +#include <asm/setup.h> +#include <asm/hugetlb.h> -#define HPAGE_SHIFT_64K 16 -#define HPAGE_SHIFT_16M 24 +#ifdef CONFIG_HUGETLB_PAGE -#define NUM_LOW_AREAS (0x100000000UL >> SID_SHIFT) -#define NUM_HIGH_AREAS (PGTABLE_RANGE >> HTLB_AREA_SHIFT) +#define PAGE_SHIFT_64K 16 +#define PAGE_SHIFT_16M 24 +#define PAGE_SHIFT_16G 34 -unsigned int hugepte_shift; -#define PTRS_PER_HUGEPTE (1 << hugepte_shift) -#define HUGEPTE_TABLE_SIZE (sizeof(pte_t) << hugepte_shift) +unsigned int HPAGE_SHIFT; -#define HUGEPD_SHIFT (HPAGE_SHIFT + hugepte_shift) -#define HUGEPD_SIZE (1UL << HUGEPD_SHIFT) -#define HUGEPD_MASK (~(HUGEPD_SIZE-1)) +/* + * Tracks gpages after the device tree is scanned and before the + * huge_boot_pages list is ready. On non-Freescale implementations, this is + * just used to track 16G pages and so is a single array. FSL-based + * implementations may have more than one gpage size, so we need multiple + * arrays + */ +#ifdef CONFIG_PPC_FSL_BOOK3E +#define MAX_NUMBER_GPAGES 128 +struct psize_gpages { + u64 gpage_list[MAX_NUMBER_GPAGES]; + unsigned int nr_gpages; +}; +static struct psize_gpages gpage_freearray[MMU_PAGE_COUNT]; +#else +#define MAX_NUMBER_GPAGES 1024 +static u64 gpage_freearray[MAX_NUMBER_GPAGES]; +static unsigned nr_gpages; +#endif -#define huge_pgtable_cache (pgtable_cache[HUGEPTE_CACHE_NUM]) +#define hugepd_none(hpd) ((hpd).pd == 0) -/* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad() - * will choke on pointers to hugepte tables, which is handy for - * catching screwups early. */ -#define HUGEPD_OK 0x1 +#ifdef CONFIG_PPC_BOOK3S_64 +/* + * At this point we do the placement change only for BOOK3S 64. This would + * possibly work on other subarchs. + */ -typedef struct { unsigned long pd; } hugepd_t; +/* + * We have PGD_INDEX_SIZ = 12 and PTE_INDEX_SIZE = 8, so that we can have + * 16GB hugepage pte in PGD and 16MB hugepage pte at PMD; + */ +int pmd_huge(pmd_t pmd) +{ + /* + * leaf pte for huge page, bottom two bits != 00 + */ + return ((pmd_val(pmd) & 0x3) != 0x0); +} -#define hugepd_none(hpd) ((hpd).pd == 0) +int pud_huge(pud_t pud) +{ + /* + * leaf pte for huge page, bottom two bits != 00 + */ + return ((pud_val(pud) & 0x3) != 0x0); +} + +int pgd_huge(pgd_t pgd) +{ + /* + * leaf pte for huge page, bottom two bits != 00 + */ + return ((pgd_val(pgd) & 0x3) != 0x0); +} +#else +int pmd_huge(pmd_t pmd) +{ + return 0; +} -static inline pte_t *hugepd_page(hugepd_t hpd) +int pud_huge(pud_t pud) { - BUG_ON(!(hpd.pd & HUGEPD_OK)); - return (pte_t *)(hpd.pd & ~HUGEPD_OK); + return 0; } -static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr) +int pgd_huge(pgd_t pgd) { - unsigned long idx = ((addr >> HPAGE_SHIFT) & (PTRS_PER_HUGEPTE-1)); - pte_t *dir = hugepd_page(*hpdp); + return 0; +} +#endif - return dir + idx; +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +{ + /* Only called for hugetlbfs pages, hence can ignore THP */ + return find_linux_pte_or_hugepte(mm->pgd, addr, NULL); } static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, - unsigned long address) + unsigned long address, unsigned pdshift, unsigned pshift) { - pte_t *new = kmem_cache_alloc(huge_pgtable_cache, - GFP_KERNEL|__GFP_REPEAT); + struct kmem_cache *cachep; + pte_t *new; + +#ifdef CONFIG_PPC_FSL_BOOK3E + int i; + int num_hugepd = 1 << (pshift - pdshift); + cachep = hugepte_cache; +#else + cachep = PGT_CACHE(pdshift - pshift); +#endif + + new = kmem_cache_zalloc(cachep, GFP_KERNEL|__GFP_REPEAT); + + BUG_ON(pshift > HUGEPD_SHIFT_MASK); + BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK); if (! new) return -ENOMEM; spin_lock(&mm->page_table_lock); +#ifdef CONFIG_PPC_FSL_BOOK3E + /* + * We have multiple higher-level entries that point to the same + * actual pte location. Fill in each as we go and backtrack on error. + * We need all of these so the DTLB pgtable walk code can find the + * right higher-level entry without knowing if it's a hugepage or not. + */ + for (i = 0; i < num_hugepd; i++, hpdp++) { + if (unlikely(!hugepd_none(*hpdp))) + break; + else + /* We use the old format for PPC_FSL_BOOK3E */ + hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift; + } + /* If we bailed from the for loop early, an error occurred, clean up */ + if (i < num_hugepd) { + for (i = i - 1 ; i >= 0; i--, hpdp--) + hpdp->pd = 0; + kmem_cache_free(cachep, new); + } +#else if (!hugepd_none(*hpdp)) - kmem_cache_free(huge_pgtable_cache, new); - else - hpdp->pd = (unsigned long)new | HUGEPD_OK; + kmem_cache_free(cachep, new); + else { +#ifdef CONFIG_PPC_BOOK3S_64 + hpdp->pd = (unsigned long)new | + (shift_to_mmu_psize(pshift) << 2); +#else + hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift; +#endif + } +#endif spin_unlock(&mm->page_table_lock); return 0; } -/* Base page size affects how we walk hugetlb page tables */ -#ifdef CONFIG_PPC_64K_PAGES -#define hpmd_offset(pud, addr) pmd_offset(pud, addr) -#define hpmd_alloc(mm, pud, addr) pmd_alloc(mm, pud, addr) +/* + * These macros define how to determine which level of the page table holds + * the hpdp. + */ +#ifdef CONFIG_PPC_FSL_BOOK3E +#define HUGEPD_PGD_SHIFT PGDIR_SHIFT +#define HUGEPD_PUD_SHIFT PUD_SHIFT #else -static inline -pmd_t *hpmd_offset(pud_t *pud, unsigned long addr) -{ - if (HPAGE_SHIFT == HPAGE_SHIFT_64K) - return pmd_offset(pud, addr); - else - return (pmd_t *) pud; -} -static inline -pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr) -{ - if (HPAGE_SHIFT == HPAGE_SHIFT_64K) - return pmd_alloc(mm, pud, addr); - else - return (pmd_t *) pud; -} +#define HUGEPD_PGD_SHIFT PUD_SHIFT +#define HUGEPD_PUD_SHIFT PMD_SHIFT #endif -/* Modelled after find_linux_pte() */ -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +#ifdef CONFIG_PPC_BOOK3S_64 +/* + * At this point we do the placement change only for BOOK3S 64. This would + * possibly work on other subarchs. + */ +pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) { pgd_t *pg; pud_t *pu; pmd_t *pm; + hugepd_t *hpdp = NULL; + unsigned pshift = __ffs(sz); + unsigned pdshift = PGDIR_SHIFT; - BUG_ON(get_slice_psize(mm, addr) != mmu_huge_psize); - - addr &= HPAGE_MASK; - + addr &= ~(sz-1); pg = pgd_offset(mm, addr); - if (!pgd_none(*pg)) { - pu = pud_offset(pg, addr); - if (!pud_none(*pu)) { - pm = hpmd_offset(pu, addr); - if (!pmd_none(*pm)) - return hugepte_offset((hugepd_t *)pm, addr); + + if (pshift == PGDIR_SHIFT) + /* 16GB huge page */ + return (pte_t *) pg; + else if (pshift > PUD_SHIFT) + /* + * We need to use hugepd table + */ + hpdp = (hugepd_t *)pg; + else { + pdshift = PUD_SHIFT; + pu = pud_alloc(mm, pg, addr); + if (pshift == PUD_SHIFT) + return (pte_t *)pu; + else if (pshift > PMD_SHIFT) + hpdp = (hugepd_t *)pu; + else { + pdshift = PMD_SHIFT; + pm = pmd_alloc(mm, pu, addr); + if (pshift == PMD_SHIFT) + /* 16MB hugepage */ + return (pte_t *)pm; + else + hpdp = (hugepd_t *)pm; } } + if (!hpdp) + return NULL; - return NULL; + BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp)); + + if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift)) + return NULL; + + return hugepte_offset(hpdp, addr, pdshift); } -pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) +#else + +pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) { pgd_t *pg; pud_t *pu; pmd_t *pm; hugepd_t *hpdp = NULL; + unsigned pshift = __ffs(sz); + unsigned pdshift = PGDIR_SHIFT; - BUG_ON(get_slice_psize(mm, addr) != mmu_huge_psize); - - addr &= HPAGE_MASK; + addr &= ~(sz-1); pg = pgd_offset(mm, addr); - pu = pud_alloc(mm, pg, addr); - if (pu) { - pm = hpmd_alloc(mm, pu, addr); - if (pm) + if (pshift >= HUGEPD_PGD_SHIFT) { + hpdp = (hugepd_t *)pg; + } else { + pdshift = PUD_SHIFT; + pu = pud_alloc(mm, pg, addr); + if (pshift >= HUGEPD_PUD_SHIFT) { + hpdp = (hugepd_t *)pu; + } else { + pdshift = PMD_SHIFT; + pm = pmd_alloc(mm, pu, addr); hpdp = (hugepd_t *)pm; + } } - if (! hpdp) + if (!hpdp) return NULL; - if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr)) + BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp)); + + if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift)) return NULL; - return hugepte_offset(hpdp, addr); + return hugepte_offset(hpdp, addr, pdshift); +} +#endif + +#ifdef CONFIG_PPC_FSL_BOOK3E +/* Build list of addresses of gigantic pages. This function is used in early + * boot before the buddy or bootmem allocator is setup. + */ +void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) +{ + unsigned int idx = shift_to_mmu_psize(__ffs(page_size)); + int i; + + if (addr == 0) + return; + + gpage_freearray[idx].nr_gpages = number_of_pages; + + for (i = 0; i < number_of_pages; i++) { + gpage_freearray[idx].gpage_list[i] = addr; + addr += page_size; + } +} + +/* + * Moves the gigantic page addresses from the temporary list to the + * huge_boot_pages list. + */ +int alloc_bootmem_huge_page(struct hstate *hstate) +{ + struct huge_bootmem_page *m; + int idx = shift_to_mmu_psize(huge_page_shift(hstate)); + int nr_gpages = gpage_freearray[idx].nr_gpages; + + if (nr_gpages == 0) + return 0; + +#ifdef CONFIG_HIGHMEM + /* + * If gpages can be in highmem we can't use the trick of storing the + * data structure in the page; allocate space for this + */ + m = alloc_bootmem(sizeof(struct huge_bootmem_page)); + m->phys = gpage_freearray[idx].gpage_list[--nr_gpages]; +#else + m = phys_to_virt(gpage_freearray[idx].gpage_list[--nr_gpages]); +#endif + + list_add(&m->list, &huge_boot_pages); + gpage_freearray[idx].nr_gpages = nr_gpages; + gpage_freearray[idx].gpage_list[nr_gpages] = 0; + m->hstate = hstate; + + return 1; +} +/* + * Scan the command line hugepagesz= options for gigantic pages; store those in + * a list that we use to allocate the memory once all options are parsed. + */ + +unsigned long gpage_npages[MMU_PAGE_COUNT]; + +static int __init do_gpage_early_setup(char *param, char *val, + const char *unused) +{ + static phys_addr_t size; + unsigned long npages; + + /* + * The hugepagesz and hugepages cmdline options are interleaved. We + * use the size variable to keep track of whether or not this was done + * properly and skip over instances where it is incorrect. Other + * command-line parsing code will issue warnings, so we don't need to. + * + */ + if ((strcmp(param, "default_hugepagesz") == 0) || + (strcmp(param, "hugepagesz") == 0)) { + size = memparse(val, NULL); + } else if (strcmp(param, "hugepages") == 0) { + if (size != 0) { + if (sscanf(val, "%lu", &npages) <= 0) + npages = 0; + gpage_npages[shift_to_mmu_psize(__ffs(size))] = npages; + size = 0; + } + } + return 0; } + +/* + * This function allocates physical space for pages that are larger than the + * buddy allocator can handle. We want to allocate these in highmem because + * the amount of lowmem is limited. This means that this function MUST be + * called before lowmem_end_addr is set up in MMU_init() in order for the lmb + * allocate to grab highmem. + */ +void __init reserve_hugetlb_gpages(void) +{ + static __initdata char cmdline[COMMAND_LINE_SIZE]; + phys_addr_t size, base; + int i; + + strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE); + parse_args("hugetlb gpages", cmdline, NULL, 0, 0, 0, + &do_gpage_early_setup); + + /* + * Walk gpage list in reverse, allocating larger page sizes first. + * Skip over unsupported sizes, or sizes that have 0 gpages allocated. + * When we reach the point in the list where pages are no longer + * considered gpages, we're done. + */ + for (i = MMU_PAGE_COUNT-1; i >= 0; i--) { + if (mmu_psize_defs[i].shift == 0 || gpage_npages[i] == 0) + continue; + else if (mmu_psize_to_shift(i) < (MAX_ORDER + PAGE_SHIFT)) + break; + + size = (phys_addr_t)(1ULL << mmu_psize_to_shift(i)); + base = memblock_alloc_base(size * gpage_npages[i], size, + MEMBLOCK_ALLOC_ANYWHERE); + add_gpage(base, size, gpage_npages[i]); + } +} + +#else /* !PPC_FSL_BOOK3E */ + +/* Build list of addresses of gigantic pages. This function is used in early + * boot before the buddy or bootmem allocator is setup. + */ +void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) +{ + if (!addr) + return; + while (number_of_pages > 0) { + gpage_freearray[nr_gpages] = addr; + nr_gpages++; + number_of_pages--; + addr += page_size; + } +} + +/* Moves the gigantic page addresses from the temporary list to the + * huge_boot_pages list. + */ +int alloc_bootmem_huge_page(struct hstate *hstate) +{ + struct huge_bootmem_page *m; + if (nr_gpages == 0) + return 0; + m = phys_to_virt(gpage_freearray[--nr_gpages]); + gpage_freearray[nr_gpages] = 0; + list_add(&m->list, &huge_boot_pages); + m->hstate = hstate; + return 1; +} +#endif + int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) { return 0; } -static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp) +#ifdef CONFIG_PPC_FSL_BOOK3E +#define HUGEPD_FREELIST_SIZE \ + ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t)) + +struct hugepd_freelist { + struct rcu_head rcu; + unsigned int index; + void *ptes[0]; +}; + +static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur); + +static void hugepd_free_rcu_callback(struct rcu_head *head) +{ + struct hugepd_freelist *batch = + container_of(head, struct hugepd_freelist, rcu); + unsigned int i; + + for (i = 0; i < batch->index; i++) + kmem_cache_free(hugepte_cache, batch->ptes[i]); + + free_page((unsigned long)batch); +} + +static void hugepd_free(struct mmu_gather *tlb, void *hugepte) +{ + struct hugepd_freelist **batchp; + + batchp = &get_cpu_var(hugepd_freelist_cur); + + if (atomic_read(&tlb->mm->mm_users) < 2 || + cpumask_equal(mm_cpumask(tlb->mm), + cpumask_of(smp_processor_id()))) { + kmem_cache_free(hugepte_cache, hugepte); + put_cpu_var(hugepd_freelist_cur); + return; + } + + if (*batchp == NULL) { + *batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC); + (*batchp)->index = 0; + } + + (*batchp)->ptes[(*batchp)->index++] = hugepte; + if ((*batchp)->index == HUGEPD_FREELIST_SIZE) { + call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback); + *batchp = NULL; + } + put_cpu_var(hugepd_freelist_cur); +} +#endif + +static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift, + unsigned long start, unsigned long end, + unsigned long floor, unsigned long ceiling) { pte_t *hugepte = hugepd_page(*hpdp); + int i; + + unsigned long pdmask = ~((1UL << pdshift) - 1); + unsigned int num_hugepd = 1; + +#ifdef CONFIG_PPC_FSL_BOOK3E + /* Note: On fsl the hpdp may be the first of several */ + num_hugepd = (1 << (hugepd_shift(*hpdp) - pdshift)); +#else + unsigned int shift = hugepd_shift(*hpdp); +#endif + + start &= pdmask; + if (start < floor) + return; + if (ceiling) { + ceiling &= pdmask; + if (! ceiling) + return; + } + if (end - 1 > ceiling - 1) + return; + + for (i = 0; i < num_hugepd; i++, hpdp++) + hpdp->pd = 0; - hpdp->pd = 0; tlb->need_flush = 1; - pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, HUGEPTE_CACHE_NUM, - PGF_CACHENUM_MASK)); + +#ifdef CONFIG_PPC_FSL_BOOK3E + hugepd_free(tlb, hugepte); +#else + pgtable_free_tlb(tlb, hugepte, pdshift - shift); +#endif } static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, @@ -181,13 +535,29 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, unsigned long start; start = addr; - pmd = pmd_offset(pud, addr); do { + pmd = pmd_offset(pud, addr); next = pmd_addr_end(addr, end); - if (pmd_none(*pmd)) + if (!is_hugepd(pmd)) { + /* + * if it is not hugepd pointer, we should already find + * it cleared. + */ + WARN_ON(!pmd_none_or_clear_bad(pmd)); continue; - free_hugepte_range(tlb, (hugepd_t *)pmd); - } while (pmd++, addr = next, addr != end); + } +#ifdef CONFIG_PPC_FSL_BOOK3E + /* + * Increment next by the size of the huge mapping since + * there may be more than one entry at this level for a + * single hugepage, but all of them point to + * the same kmem cache that holds the hugepte. + */ + next = addr + (1 << hugepd_shift(*(hugepd_t *)pmd)); +#endif + free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT, + addr, next, floor, ceiling); + } while (addr = next, addr != end); start &= PUD_MASK; if (start < floor) @@ -202,7 +572,7 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, pmd = pmd_offset(pud, start); pud_clear(pud); - pmd_free_tlb(tlb, pmd); + pmd_free_tlb(tlb, pmd, start); } static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, @@ -214,25 +584,28 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, unsigned long start; start = addr; - pud = pud_offset(pgd, addr); do { + pud = pud_offset(pgd, addr); next = pud_addr_end(addr, end); -#ifdef CONFIG_PPC_64K_PAGES - if (pud_none_or_clear_bad(pud)) - continue; - hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling); -#else - if (HPAGE_SHIFT == HPAGE_SHIFT_64K) { + if (!is_hugepd(pud)) { if (pud_none_or_clear_bad(pud)) continue; - hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling); + hugetlb_free_pmd_range(tlb, pud, addr, next, floor, + ceiling); } else { - if (pud_none(*pud)) - continue; - free_hugepte_range(tlb, (hugepd_t *)pud); - } +#ifdef CONFIG_PPC_FSL_BOOK3E + /* + * Increment next by the size of the huge mapping since + * there may be more than one entry at this level for a + * single hugepage, but all of them point to + * the same kmem cache that holds the hugepte. + */ + next = addr + (1 << hugepd_shift(*(hugepd_t *)pud)); #endif - } while (pud++, addr = next, addr != end); + free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT, + addr, next, floor, ceiling); + } + } while (addr = next, addr != end); start &= PGDIR_MASK; if (start < floor) @@ -247,103 +620,57 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, pud = pud_offset(pgd, start); pgd_clear(pgd); - pud_free_tlb(tlb, pud); + pud_free_tlb(tlb, pud, start); } /* * This function frees user-level page tables of a process. - * - * Must be called with pagetable lock held. */ -void hugetlb_free_pgd_range(struct mmu_gather **tlb, +void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { pgd_t *pgd; unsigned long next; - unsigned long start; /* - * Comments below take from the normal free_pgd_range(). They - * apply here too. The tests against HUGEPD_MASK below are - * essential, because we *don't* test for this at the bottom - * level. Without them we'll attempt to free a hugepte table - * when we unmap just part of it, even if there are other - * active mappings using it. - * - * The next few lines have given us lots of grief... + * Because there are a number of different possible pagetable + * layouts for hugepage ranges, we limit knowledge of how + * things should be laid out to the allocation path + * (huge_pte_alloc(), above). Everything else works out the + * structure as it goes from information in the hugepd + * pointers. That means that we can't here use the + * optimization used in the normal page free_pgd_range(), of + * checking whether we're actually covering a large enough + * range to have to do anything at the top level of the walk + * instead of at the bottom. * - * Why are we testing HUGEPD* at this top level? Because - * often there will be no work to do at all, and we'd prefer - * not to go all the way down to the bottom just to discover - * that. - * - * Why all these "- 1"s? Because 0 represents both the bottom - * of the address space and the top of it (using -1 for the - * top wouldn't help much: the masks would do the wrong thing). - * The rule is that addr 0 and floor 0 refer to the bottom of - * the address space, but end 0 and ceiling 0 refer to the top - * Comparisons need to use "end - 1" and "ceiling - 1" (though - * that end 0 case should be mythical). - * - * Wherever addr is brought up or ceiling brought down, we - * must be careful to reject "the opposite 0" before it - * confuses the subsequent tests. But what about where end is - * brought down by HUGEPD_SIZE below? no, end can't go down to - * 0 there. - * - * Whereas we round start (addr) and ceiling down, by different - * masks at different levels, in order to test whether a table - * now has no other vmas using it, so can be freed, we don't - * bother to round floor or end up - the tests don't need that. + * To make sense of this, you should probably go read the big + * block comment at the top of the normal free_pgd_range(), + * too. */ - addr &= HUGEPD_MASK; - if (addr < floor) { - addr += HUGEPD_SIZE; - if (!addr) - return; - } - if (ceiling) { - ceiling &= HUGEPD_MASK; - if (!ceiling) - return; - } - if (end - 1 > ceiling - 1) - end -= HUGEPD_SIZE; - if (addr > end - 1) - return; - - start = addr; - pgd = pgd_offset((*tlb)->mm, addr); do { - BUG_ON(get_slice_psize((*tlb)->mm, addr) != mmu_huge_psize); next = pgd_addr_end(addr, end); - if (pgd_none_or_clear_bad(pgd)) - continue; - hugetlb_free_pud_range(*tlb, pgd, addr, next, floor, ceiling); - } while (pgd++, addr = next, addr != end); -} - -void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - if (pte_present(*ptep)) { - /* We open-code pte_clear because we need to pass the right - * argument to hpte_need_flush (huge / !huge). Might not be - * necessary anymore if we make hpte_need_flush() get the - * page size from the slices - */ - pte_update(mm, addr & HPAGE_MASK, ptep, ~0UL, 1); - } - *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); -} - -pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1); - return __pte(old); + pgd = pgd_offset(tlb->mm, addr); + if (!is_hugepd(pgd)) { + if (pgd_none_or_clear_bad(pgd)) + continue; + hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling); + } else { +#ifdef CONFIG_PPC_FSL_BOOK3E + /* + * Increment next by the size of the huge mapping since + * there may be more than one entry at the pgd level + * for a single hugepage, but all of them point to the + * same kmem cache that holds the hugepte. + */ + next = addr + (1 << hugepd_shift(*(hugepd_t *)pgd)); +#endif + free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT, + addr, next, floor, ceiling); + } + } while (addr = next, addr != end); } struct page * @@ -351,23 +678,26 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) { pte_t *ptep; struct page *page; + unsigned shift; + unsigned long mask; + /* + * Transparent hugepages are handled by generic code. We can skip them + * here. + */ + ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift); - if (get_slice_psize(mm, address) != mmu_huge_psize) + /* Verify it is a huge page else bail. */ + if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep)) return ERR_PTR(-EINVAL); - ptep = huge_pte_offset(mm, address); + mask = (1UL << shift) - 1; page = pte_page(*ptep); if (page) - page += (address % HPAGE_SIZE) / PAGE_SIZE; + page += (address & mask) / PAGE_SIZE; return page; } -int pmd_huge(pmd_t pmd) -{ - return 0; -} - struct page * follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) @@ -376,244 +706,384 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, return NULL; } +static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, + unsigned long sz) +{ + unsigned long __boundary = (addr + sz) & ~(sz-1); + return (__boundary - 1 < end - 1) ? __boundary : end; +} + +int gup_hugepd(hugepd_t *hugepd, unsigned pdshift, + unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + pte_t *ptep; + unsigned long sz = 1UL << hugepd_shift(*hugepd); + unsigned long next; + + ptep = hugepte_offset(hugepd, addr, pdshift); + do { + next = hugepte_addr_end(addr, end, sz); + if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr)) + return 0; + } while (ptep++, addr = next, addr != end); + + return 1; +} +#ifdef CONFIG_PPC_MM_SLICES unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - return slice_get_unmapped_area(addr, len, flags, - mmu_huge_psize, 1, 0); + struct hstate *hstate = hstate_file(file); + int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate)); + + return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1); } +#endif -/* - * Called by asm hashtable.S for doing lazy icache flush - */ -static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags, - pte_t pte, int trap) +unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) { - struct page *page; - int i; +#ifdef CONFIG_PPC_MM_SLICES + unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start); - if (!pfn_valid(pte_pfn(pte))) - return rflags; + return 1UL << mmu_psize_to_shift(psize); +#else + if (!is_vm_hugetlb_page(vma)) + return PAGE_SIZE; - page = pte_page(pte); + return huge_page_size(hstate_vma(vma)); +#endif +} - /* page is dirty */ - if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) { - if (trap == 0x400) { - for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) - __flush_dcache_icache(page_address(page+i)); - set_bit(PG_arch_1, &page->flags); - } else { - rflags |= HPTE_R_N; - } - } - return rflags; +static inline bool is_power_of_4(unsigned long x) +{ + if (is_power_of_2(x)) + return (__ilog2(x) % 2) ? false : true; + return false; } -int hash_huge_page(struct mm_struct *mm, unsigned long access, - unsigned long ea, unsigned long vsid, int local, - unsigned long trap) +static int __init add_huge_page_size(unsigned long long size) { - pte_t *ptep; - unsigned long old_pte, new_pte; - unsigned long va, rflags, pa; - long slot; - int err = 1; - int ssize = user_segment_size(ea); + int shift = __ffs(size); + int mmu_psize; - ptep = huge_pte_offset(mm, ea); + /* Check that it is a page size supported by the hardware and + * that it fits within pagetable and slice limits. */ +#ifdef CONFIG_PPC_FSL_BOOK3E + if ((size < PAGE_SIZE) || !is_power_of_4(size)) + return -EINVAL; +#else + if (!is_power_of_2(size) + || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT)) + return -EINVAL; +#endif - /* Search the Linux page table for a match with va */ - va = hpt_va(ea, vsid, ssize); + if ((mmu_psize = shift_to_mmu_psize(shift)) < 0) + return -EINVAL; - /* - * If no pte found or not present, send the problem up to - * do_page_fault +#ifdef CONFIG_SPU_FS_64K_LS + /* Disable support for 64K huge pages when 64K SPU local store + * support is enabled as the current implementation conflicts. */ - if (unlikely(!ptep || pte_none(*ptep))) - goto out; + if (shift == PAGE_SHIFT_64K) + return -EINVAL; +#endif /* CONFIG_SPU_FS_64K_LS */ - /* - * Check the user's access rights to the page. If access should be - * prevented then send the problem up to do_page_fault. - */ - if (unlikely(access & ~pte_val(*ptep))) - goto out; - /* - * At this point, we have a pte (old_pte) which can be used to build - * or update an HPTE. There are 2 cases: - * - * 1. There is a valid (present) pte with no associated HPTE (this is - * the most common case) - * 2. There is a valid (present) pte with an associated HPTE. The - * current values of the pp bits in the HPTE prevent access - * because we are doing software DIRTY bit management and the - * page is currently not DIRTY. - */ + BUG_ON(mmu_psize_defs[mmu_psize].shift != shift); + /* Return if huge page size has already been setup */ + if (size_to_hstate(size)) + return 0; - do { - old_pte = pte_val(*ptep); - if (old_pte & _PAGE_BUSY) - goto out; - new_pte = old_pte | _PAGE_BUSY | - _PAGE_ACCESSED | _PAGE_HASHPTE; - } while(old_pte != __cmpxchg_u64((unsigned long *)ptep, - old_pte, new_pte)); - - rflags = 0x2 | (!(new_pte & _PAGE_RW)); - /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */ - rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N); - if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) - /* No CPU has hugepages but lacks no execute, so we - * don't need to worry about that case */ - rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte), - trap); - - /* Check if pte already has an hpte (case 2) */ - if (unlikely(old_pte & _PAGE_HASHPTE)) { - /* There MIGHT be an HPTE for this pte */ - unsigned long hash, slot; - - hash = hpt_hash(va, HPAGE_SHIFT, ssize); - if (old_pte & _PAGE_F_SECOND) - hash = ~hash; - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += (old_pte & _PAGE_F_GIX) >> 12; - - if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_huge_psize, - ssize, local) == -1) - old_pte &= ~_PAGE_HPTEFLAGS; - } + hugetlb_add_hstate(shift - PAGE_SHIFT); - if (likely(!(old_pte & _PAGE_HASHPTE))) { - unsigned long hash = hpt_hash(va, HPAGE_SHIFT, ssize); - unsigned long hpte_group; - - pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; - -repeat: - hpte_group = ((hash & htab_hash_mask) * - HPTES_PER_GROUP) & ~0x7UL; - - /* clear HPTE slot informations in new PTE */ - new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; - - /* Add in WIMG bits */ - /* XXX We should store these in the pte */ - /* --BenH: I think they are ... */ - rflags |= _PAGE_COHERENT; - - /* Insert into the hash table, primary slot */ - slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0, - mmu_huge_psize, ssize); - - /* Primary is full, try the secondary */ - if (unlikely(slot == -1)) { - hpte_group = ((~hash & htab_hash_mask) * - HPTES_PER_GROUP) & ~0x7UL; - slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, - HPTE_V_SECONDARY, - mmu_huge_psize, ssize); - if (slot == -1) { - if (mftb() & 0x1) - hpte_group = ((hash & htab_hash_mask) * - HPTES_PER_GROUP)&~0x7UL; - - ppc_md.hpte_remove(hpte_group); - goto repeat; - } - } + return 0; +} + +static int __init hugepage_setup_sz(char *str) +{ + unsigned long long size; + + size = memparse(str, &str); + + if (add_huge_page_size(size) != 0) + printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size); + + return 1; +} +__setup("hugepagesz=", hugepage_setup_sz); + +#ifdef CONFIG_PPC_FSL_BOOK3E +struct kmem_cache *hugepte_cache; +static int __init hugetlbpage_init(void) +{ + int psize; + + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { + unsigned shift; - if (unlikely(slot == -2)) - panic("hash_huge_page: pte_insert failed\n"); + if (!mmu_psize_defs[psize].shift) + continue; - new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX); + shift = mmu_psize_to_shift(psize); + + /* Don't treat normal page sizes as huge... */ + if (shift != PAGE_SHIFT) + if (add_huge_page_size(1ULL << shift) < 0) + continue; } /* - * No need to use ldarx/stdcx here + * Create a kmem cache for hugeptes. The bottom bits in the pte have + * size information encoded in them, so align them to allow this */ - *ptep = __pte(new_pte & ~_PAGE_BUSY); + hugepte_cache = kmem_cache_create("hugepte-cache", sizeof(pte_t), + HUGEPD_SHIFT_MASK + 1, 0, NULL); + if (hugepte_cache == NULL) + panic("%s: Unable to create kmem cache for hugeptes\n", + __func__); + + /* Default hpage size = 4M */ + if (mmu_psize_defs[MMU_PAGE_4M].shift) + HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift; + else + panic("%s: Unable to set default huge page size\n", __func__); - err = 0; - out: - return err; + return 0; } - -void set_huge_psize(int psize) -{ - /* Check that it is a page size supported by the hardware and - * that it fits within pagetable limits. */ - if (mmu_psize_defs[psize].shift && mmu_psize_defs[psize].shift < SID_SHIFT && - (mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT || - mmu_psize_defs[psize].shift == HPAGE_SHIFT_64K)) { - HPAGE_SHIFT = mmu_psize_defs[psize].shift; - mmu_huge_psize = psize; -#ifdef CONFIG_PPC_64K_PAGES - hugepte_shift = (PMD_SHIFT-HPAGE_SHIFT); #else - if (HPAGE_SHIFT == HPAGE_SHIFT_64K) - hugepte_shift = (PMD_SHIFT-HPAGE_SHIFT); +static int __init hugetlbpage_init(void) +{ + int psize; + + if (!mmu_has_feature(MMU_FTR_16M_PAGE)) + return -ENODEV; + + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { + unsigned shift; + unsigned pdshift; + + if (!mmu_psize_defs[psize].shift) + continue; + + shift = mmu_psize_to_shift(psize); + + if (add_huge_page_size(1ULL << shift) < 0) + continue; + + if (shift < PMD_SHIFT) + pdshift = PMD_SHIFT; + else if (shift < PUD_SHIFT) + pdshift = PUD_SHIFT; else - hugepte_shift = (PUD_SHIFT-HPAGE_SHIFT); -#endif + pdshift = PGDIR_SHIFT; + /* + * if we have pdshift and shift value same, we don't + * use pgt cache for hugepd. + */ + if (pdshift != shift) { + pgtable_cache_add(pdshift - shift, NULL); + if (!PGT_CACHE(pdshift - shift)) + panic("hugetlbpage_init(): could not create " + "pgtable cache for %d bit pagesize\n", shift); + } + } - } else - HPAGE_SHIFT = 0; + /* Set default large page size. Currently, we pick 16M or 1M + * depending on what is available + */ + if (mmu_psize_defs[MMU_PAGE_16M].shift) + HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift; + else if (mmu_psize_defs[MMU_PAGE_1M].shift) + HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift; + + return 0; } +#endif +module_init(hugetlbpage_init); -static int __init hugepage_setup_sz(char *str) +void flush_dcache_icache_hugepage(struct page *page) { - unsigned long long size; - int mmu_psize = -1; - int shift; + int i; + void *start; - size = memparse(str, &str); + BUG_ON(!PageCompound(page)); - shift = __ffs(size); - switch (shift) { -#ifndef CONFIG_PPC_64K_PAGES - case HPAGE_SHIFT_64K: - mmu_psize = MMU_PAGE_64K; - break; -#endif - case HPAGE_SHIFT_16M: - mmu_psize = MMU_PAGE_16M; - break; + for (i = 0; i < (1UL << compound_order(page)); i++) { + if (!PageHighMem(page)) { + __flush_dcache_icache(page_address(page+i)); + } else { + start = kmap_atomic(page+i); + __flush_dcache_icache(start); + kunmap_atomic(start); + } } +} - if (mmu_psize >=0 && mmu_psize_defs[mmu_psize].shift) - set_huge_psize(mmu_psize); - else - printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size); +#endif /* CONFIG_HUGETLB_PAGE */ - return 1; -} -__setup("hugepagesz=", hugepage_setup_sz); +/* + * We have 4 cases for pgds and pmds: + * (1) invalid (all zeroes) + * (2) pointer to next table, as normal; bottom 6 bits == 0 + * (3) leaf pte for huge page, bottom two bits != 00 + * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table + * + * So long as we atomically load page table pointers we are safe against teardown, + * we can follow the address down to the the page and take a ref on it. + */ -static void zero_ctor(struct kmem_cache *cache, void *addr) +pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift) { - memset(addr, 0, kmem_cache_size(cache)); + pgd_t pgd, *pgdp; + pud_t pud, *pudp; + pmd_t pmd, *pmdp; + pte_t *ret_pte; + hugepd_t *hpdp = NULL; + unsigned pdshift = PGDIR_SHIFT; + + if (shift) + *shift = 0; + + pgdp = pgdir + pgd_index(ea); + pgd = ACCESS_ONCE(*pgdp); + /* + * Always operate on the local stack value. This make sure the + * value don't get updated by a parallel THP split/collapse, + * page fault or a page unmap. The return pte_t * is still not + * stable. So should be checked there for above conditions. + */ + if (pgd_none(pgd)) + return NULL; + else if (pgd_huge(pgd)) { + ret_pte = (pte_t *) pgdp; + goto out; + } else if (is_hugepd(&pgd)) + hpdp = (hugepd_t *)&pgd; + else { + /* + * Even if we end up with an unmap, the pgtable will not + * be freed, because we do an rcu free and here we are + * irq disabled + */ + pdshift = PUD_SHIFT; + pudp = pud_offset(&pgd, ea); + pud = ACCESS_ONCE(*pudp); + + if (pud_none(pud)) + return NULL; + else if (pud_huge(pud)) { + ret_pte = (pte_t *) pudp; + goto out; + } else if (is_hugepd(&pud)) + hpdp = (hugepd_t *)&pud; + else { + pdshift = PMD_SHIFT; + pmdp = pmd_offset(&pud, ea); + pmd = ACCESS_ONCE(*pmdp); + /* + * A hugepage collapse is captured by pmd_none, because + * it mark the pmd none and do a hpte invalidate. + * + * A hugepage split is captured by pmd_trans_splitting + * because we mark the pmd trans splitting and do a + * hpte invalidate + * + */ + if (pmd_none(pmd) || pmd_trans_splitting(pmd)) + return NULL; + + if (pmd_huge(pmd) || pmd_large(pmd)) { + ret_pte = (pte_t *) pmdp; + goto out; + } else if (is_hugepd(&pmd)) + hpdp = (hugepd_t *)&pmd; + else + return pte_offset_kernel(&pmd, ea); + } + } + if (!hpdp) + return NULL; + + ret_pte = hugepte_offset(hpdp, ea, pdshift); + pdshift = hugepd_shift(*hpdp); +out: + if (shift) + *shift = pdshift; + return ret_pte; } +EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte); -static int __init hugetlbpage_init(void) +int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) { - if (!cpu_has_feature(CPU_FTR_16M_PAGE)) - return -ENODEV; + unsigned long mask; + unsigned long pte_end; + struct page *head, *page, *tail; + pte_t pte; + int refs; - huge_pgtable_cache = kmem_cache_create("hugepte_cache", - HUGEPTE_TABLE_SIZE, - HUGEPTE_TABLE_SIZE, - 0, - zero_ctor); - if (! huge_pgtable_cache) - panic("hugetlbpage_init(): could not create hugepte cache\n"); + pte_end = (addr + sz) & ~(sz-1); + if (pte_end < end) + end = pte_end; - return 0; -} + pte = ACCESS_ONCE(*ptep); + mask = _PAGE_PRESENT | _PAGE_USER; + if (write) + mask |= _PAGE_RW; -module_init(hugetlbpage_init); + if ((pte_val(pte) & mask) != mask) + return 0; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + /* + * check for splitting here + */ + if (pmd_trans_splitting(pte_pmd(pte))) + return 0; +#endif + + /* hugepages are never "special" */ + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + + refs = 0; + head = pte_page(pte); + + page = head + ((addr & (sz-1)) >> PAGE_SHIFT); + tail = page; + do { + VM_BUG_ON(compound_head(page) != head); + pages[*nr] = page; + (*nr)++; + page++; + refs++; + } while (addr += PAGE_SIZE, addr != end); + + if (!page_cache_add_speculative(head, refs)) { + *nr -= refs; + return 0; + } + + if (unlikely(pte_val(pte) != pte_val(*ptep))) { + /* Could be optimized better */ + *nr -= refs; + while (refs--) + put_page(head); + return 0; + } + + /* + * Any tail page need their mapcount reference taken before we + * return. + */ + while (refs--) { + if (PageTail(tail)) + get_huge_page_tail(tail); + tail++; + } + + return 1; +} diff --git a/arch/powerpc/mm/icswx.c b/arch/powerpc/mm/icswx.c new file mode 100644 index 00000000000..915412e4d5b --- /dev/null +++ b/arch/powerpc/mm/icswx.c @@ -0,0 +1,292 @@ +/* + * ICSWX and ACOP Management + * + * Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/spinlock.h> +#include <linux/module.h> +#include <linux/uaccess.h> + +#include "icswx.h" + +/* + * The processor and its L2 cache cause the icswx instruction to + * generate a COP_REQ transaction on PowerBus. The transaction has no + * address, and the processor does not perform an MMU access to + * authenticate the transaction. The command portion of the PowerBus + * COP_REQ transaction includes the LPAR_ID (LPID) and the coprocessor + * Process ID (PID), which the coprocessor compares to the authorized + * LPID and PID held in the coprocessor, to determine if the process + * is authorized to generate the transaction. The data of the COP_REQ + * transaction is 128-byte or less in size and is placed in cacheable + * memory on a 128-byte cache line boundary. + * + * The task to use a coprocessor should use use_cop() to mark the use + * of the Coprocessor Type (CT) and context switching. On a server + * class processor, the PID register is used only for coprocessor + * management + * and so a coprocessor PID is allocated before + * executing icswx + * instruction. Drop_cop() is used to free the + * coprocessor PID. + * + * Example: + * Host Fabric Interface (HFI) is a PowerPC network coprocessor. + * Each HFI have multiple windows. Each HFI window serves as a + * network device sending to and receiving from HFI network. + * HFI immediate send function uses icswx instruction. The immediate + * send function allows small (single cache-line) packets be sent + * without using the regular HFI send FIFO and doorbell, which are + * much slower than immediate send. + * + * For each task intending to use HFI immediate send, the HFI driver + * calls use_cop() to obtain a coprocessor PID for the task. + * The HFI driver then allocate a free HFI window and save the + * coprocessor PID to the HFI window to allow the task to use the + * HFI window. + * + * The HFI driver repeatedly creates immediate send packets and + * issues icswx instruction to send data through the HFI window. + * The HFI compares the coprocessor PID in the CPU PID register + * to the PID held in the HFI window to determine if the transaction + * is allowed. + * + * When the task to release the HFI window, the HFI driver calls + * drop_cop() to release the coprocessor PID. + */ + +void switch_cop(struct mm_struct *next) +{ +#ifdef CONFIG_PPC_ICSWX_PID + mtspr(SPRN_PID, next->context.cop_pid); +#endif + mtspr(SPRN_ACOP, next->context.acop); +} + +/** + * Start using a coprocessor. + * @acop: mask of coprocessor to be used. + * @mm: The mm the coprocessor to associate with. Most likely current mm. + * + * Return a positive PID if successful. Negative errno otherwise. + * The returned PID will be fed to the coprocessor to determine if an + * icswx transaction is authenticated. + */ +int use_cop(unsigned long acop, struct mm_struct *mm) +{ + int ret; + + if (!cpu_has_feature(CPU_FTR_ICSWX)) + return -ENODEV; + + if (!mm || !acop) + return -EINVAL; + + /* The page_table_lock ensures mm_users won't change under us */ + spin_lock(&mm->page_table_lock); + spin_lock(mm->context.cop_lockp); + + ret = get_cop_pid(mm); + if (ret < 0) + goto out; + + /* update acop */ + mm->context.acop |= acop; + + sync_cop(mm); + + /* + * If this is a threaded process then there might be other threads + * running. We need to send an IPI to force them to pick up any + * change in PID and ACOP. + */ + if (atomic_read(&mm->mm_users) > 1) + smp_call_function(sync_cop, mm, 1); + +out: + spin_unlock(mm->context.cop_lockp); + spin_unlock(&mm->page_table_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(use_cop); + +/** + * Stop using a coprocessor. + * @acop: mask of coprocessor to be stopped. + * @mm: The mm the coprocessor associated with. + */ +void drop_cop(unsigned long acop, struct mm_struct *mm) +{ + int free_pid; + + if (!cpu_has_feature(CPU_FTR_ICSWX)) + return; + + if (WARN_ON_ONCE(!mm)) + return; + + /* The page_table_lock ensures mm_users won't change under us */ + spin_lock(&mm->page_table_lock); + spin_lock(mm->context.cop_lockp); + + mm->context.acop &= ~acop; + + free_pid = disable_cop_pid(mm); + sync_cop(mm); + + /* + * If this is a threaded process then there might be other threads + * running. We need to send an IPI to force them to pick up any + * change in PID and ACOP. + */ + if (atomic_read(&mm->mm_users) > 1) + smp_call_function(sync_cop, mm, 1); + + if (free_pid != COP_PID_NONE) + free_cop_pid(free_pid); + + spin_unlock(mm->context.cop_lockp); + spin_unlock(&mm->page_table_lock); +} +EXPORT_SYMBOL_GPL(drop_cop); + +static int acop_use_cop(int ct) +{ + /* There is no alternate policy, yet */ + return -1; +} + +/* + * Get the instruction word at the NIP + */ +static u32 acop_get_inst(struct pt_regs *regs) +{ + u32 inst; + u32 __user *p; + + p = (u32 __user *)regs->nip; + if (!access_ok(VERIFY_READ, p, sizeof(*p))) + return 0; + + if (__get_user(inst, p)) + return 0; + + return inst; +} + +/** + * @regs: regsiters at time of interrupt + * @address: storage address + * @error_code: Fault code, usually the DSISR or ESR depending on + * processor type + * + * Return 0 if we are able to resolve the data storage fault that + * results from a CT miss in the ACOP register. + */ +int acop_handle_fault(struct pt_regs *regs, unsigned long address, + unsigned long error_code) +{ + int ct; + u32 inst = 0; + + if (!cpu_has_feature(CPU_FTR_ICSWX)) { + pr_info("No coprocessors available"); + _exception(SIGILL, regs, ILL_ILLOPN, address); + } + + if (!user_mode(regs)) { + /* this could happen if the HV denies the + * kernel access, for now we just die */ + die("ICSWX from kernel failed", regs, SIGSEGV); + } + + /* Some implementations leave us a hint for the CT */ + ct = ICSWX_GET_CT_HINT(error_code); + if (ct < 0) { + /* we have to peek at the instruction word to figure out CT */ + u32 ccw; + u32 rs; + + inst = acop_get_inst(regs); + if (inst == 0) + return -1; + + rs = (inst >> (31 - 10)) & 0x1f; + ccw = regs->gpr[rs]; + ct = (ccw >> 16) & 0x3f; + } + + /* + * We could be here because another thread has enabled acop + * but the ACOP register has yet to be updated. + * + * This should have been taken care of by the IPI to sync all + * the threads (see smp_call_function(sync_cop, mm, 1)), but + * that could take forever if there are a significant amount + * of threads. + * + * Given the number of threads on some of these systems, + * perhaps this is the best way to sync ACOP rather than whack + * every thread with an IPI. + */ + if ((acop_copro_type_bit(ct) & current->active_mm->context.acop) != 0) { + sync_cop(current->active_mm); + return 0; + } + + /* check for alternate policy */ + if (!acop_use_cop(ct)) + return 0; + + /* at this point the CT is unknown to the system */ + pr_warn("%s[%d]: Coprocessor %d is unavailable\n", + current->comm, current->pid, ct); + + /* get inst if we don't already have it */ + if (inst == 0) { + inst = acop_get_inst(regs); + if (inst == 0) + return -1; + } + + /* Check if the instruction is the "record form" */ + if (inst & 1) { + /* + * the instruction is "record" form so we can reject + * using CR0 + */ + regs->ccr &= ~(0xful << 28); + regs->ccr |= ICSWX_RC_NOT_FOUND << 28; + + /* Move on to the next instruction */ + regs->nip += 4; + } else { + /* + * There is no architected mechanism to report a bad + * CT so we could either SIGILL or report nothing. + * Since the non-record version should only bu used + * for "hints" or "don't care" we should probably do + * nothing. However, I could see how some people + * might want an SIGILL so it here if you want it. + */ +#ifdef CONFIG_PPC_ICSWX_USE_SIGILL + _exception(SIGILL, regs, ILL_ILLOPN, address); +#else + regs->nip += 4; +#endif + } + + return 0; +} +EXPORT_SYMBOL_GPL(acop_handle_fault); diff --git a/arch/powerpc/mm/icswx.h b/arch/powerpc/mm/icswx.h new file mode 100644 index 00000000000..6dedc08e62c --- /dev/null +++ b/arch/powerpc/mm/icswx.h @@ -0,0 +1,68 @@ +#ifndef _ARCH_POWERPC_MM_ICSWX_H_ +#define _ARCH_POWERPC_MM_ICSWX_H_ + +/* + * ICSWX and ACOP Management + * + * Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <asm/mmu_context.h> + +/* also used to denote that PIDs are not used */ +#define COP_PID_NONE 0 + +static inline void sync_cop(void *arg) +{ + struct mm_struct *mm = arg; + + if (mm == current->active_mm) + switch_cop(current->active_mm); +} + +#ifdef CONFIG_PPC_ICSWX_PID +extern int get_cop_pid(struct mm_struct *mm); +extern int disable_cop_pid(struct mm_struct *mm); +extern void free_cop_pid(int free_pid); +#else +#define get_cop_pid(m) (COP_PID_NONE) +#define disable_cop_pid(m) (COP_PID_NONE) +#define free_cop_pid(p) +#endif + +/* + * These are implementation bits for architected registers. If this + * ever becomes architecture the should be moved to reg.h et. al. + */ +/* UCT is the same bit for Server and Embedded */ +#define ICSWX_DSI_UCT 0x00004000 /* Unavailable Coprocessor Type */ + +#ifdef CONFIG_PPC_BOOK3E +/* Embedded implementation gives us no hints as to what the CT is */ +#define ICSWX_GET_CT_HINT(x) (-1) +#else +/* Server implementation contains the CT value in the DSISR */ +#define ICSWX_DSISR_CTMASK 0x00003f00 +#define ICSWX_GET_CT_HINT(x) (((x) & ICSWX_DSISR_CTMASK) >> 8) +#endif + +#define ICSWX_RC_STARTED 0x8 /* The request has been started */ +#define ICSWX_RC_NOT_IDLE 0x4 /* No coprocessor found idle */ +#define ICSWX_RC_NOT_FOUND 0x2 /* No coprocessor found */ +#define ICSWX_RC_UNDEFINED 0x1 /* Reserved */ + +extern int acop_handle_fault(struct pt_regs *regs, unsigned long address, + unsigned long error_code); + +static inline u64 acop_copro_type_bit(unsigned int type) +{ + return 1ULL << (63 - type); +} + +#endif /* !_ARCH_POWERPC_MM_ICSWX_H_ */ diff --git a/arch/powerpc/mm/icswx_pid.c b/arch/powerpc/mm/icswx_pid.c new file mode 100644 index 00000000000..91e30eb7d05 --- /dev/null +++ b/arch/powerpc/mm/icswx_pid.c @@ -0,0 +1,87 @@ +/* + * ICSWX and ACOP/PID Management + * + * Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/spinlock.h> +#include <linux/idr.h> +#include <linux/module.h> +#include "icswx.h" + +#define COP_PID_MIN (COP_PID_NONE + 1) +#define COP_PID_MAX (0xFFFF) + +static DEFINE_SPINLOCK(mmu_context_acop_lock); +static DEFINE_IDA(cop_ida); + +static int new_cop_pid(struct ida *ida, int min_id, int max_id, + spinlock_t *lock) +{ + int index; + int err; + +again: + if (!ida_pre_get(ida, GFP_KERNEL)) + return -ENOMEM; + + spin_lock(lock); + err = ida_get_new_above(ida, min_id, &index); + spin_unlock(lock); + + if (err == -EAGAIN) + goto again; + else if (err) + return err; + + if (index > max_id) { + spin_lock(lock); + ida_remove(ida, index); + spin_unlock(lock); + return -ENOMEM; + } + + return index; +} + +int get_cop_pid(struct mm_struct *mm) +{ + int pid; + + if (mm->context.cop_pid == COP_PID_NONE) { + pid = new_cop_pid(&cop_ida, COP_PID_MIN, COP_PID_MAX, + &mmu_context_acop_lock); + if (pid >= 0) + mm->context.cop_pid = pid; + } + return mm->context.cop_pid; +} + +int disable_cop_pid(struct mm_struct *mm) +{ + int free_pid = COP_PID_NONE; + + if ((!mm->context.acop) && (mm->context.cop_pid != COP_PID_NONE)) { + free_pid = mm->context.cop_pid; + mm->context.cop_pid = COP_PID_NONE; + } + return free_pid; +} + +void free_cop_pid(int free_pid) +{ + spin_lock(&mmu_context_acop_lock); + ida_remove(&cop_ida, free_pid); + spin_unlock(&mmu_context_acop_lock); +} diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 977cb1ee5e7..cff59f1bec2 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -30,37 +30,48 @@ #include <linux/highmem.h> #include <linux/initrd.h> #include <linux/pagemap.h> +#include <linux/memblock.h> +#include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/hugetlb.h> #include <asm/pgalloc.h> #include <asm/prom.h> #include <asm/io.h> -#include <asm/mmu_context.h> #include <asm/pgtable.h> #include <asm/mmu.h> #include <asm/smp.h> #include <asm/machdep.h> #include <asm/btext.h> #include <asm/tlb.h> -#include <asm/lmb.h> #include <asm/sections.h> +#include <asm/hugetlb.h> #include "mmu_decl.h" #if defined(CONFIG_KERNEL_START_BOOL) || defined(CONFIG_LOWMEM_SIZE_BOOL) -/* The ammount of lowmem must be within 0xF0000000 - KERNELBASE. */ -#if (CONFIG_LOWMEM_SIZE > (0xF0000000 - KERNELBASE)) -#error "You must adjust CONFIG_LOWMEM_SIZE or CONFIG_START_KERNEL" +/* The amount of lowmem must be within 0xF0000000 - KERNELBASE. */ +#if (CONFIG_LOWMEM_SIZE > (0xF0000000 - PAGE_OFFSET)) +#error "You must adjust CONFIG_LOWMEM_SIZE or CONFIG_KERNEL_START" #endif #endif #define MAX_LOW_MEM CONFIG_LOWMEM_SIZE -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +phys_addr_t total_memory; +phys_addr_t total_lowmem; -unsigned long total_memory; -unsigned long total_lowmem; +phys_addr_t memstart_addr = (phys_addr_t)~0ull; +EXPORT_SYMBOL(memstart_addr); +phys_addr_t kernstart_addr; +EXPORT_SYMBOL(kernstart_addr); -unsigned long ppc_memstart; -unsigned long ppc_memoffset = PAGE_OFFSET; +#ifdef CONFIG_RELOCATABLE_PPC32 +/* Used in __va()/__pa() */ +long long virt_phys_offset; +EXPORT_SYMBOL(virt_phys_offset); +#endif + +phys_addr_t lowmem_end_addr; int boot_mapsize; #ifdef CONFIG_PPC_PMAC @@ -68,21 +79,11 @@ unsigned long agp_special_page; EXPORT_SYMBOL(agp_special_page); #endif -#ifdef CONFIG_HIGHMEM -pte_t *kmap_pte; -pgprot_t kmap_prot; - -EXPORT_SYMBOL(kmap_prot); -EXPORT_SYMBOL(kmap_pte); -#endif - void MMU_init(void); /* XXX should be in current.h -- paulus */ extern struct task_struct *current_set[NR_CPUS]; -extern int init_bootmem_done; - /* * this tells the system to map all of ram with the segregs * (i.e. page tables) instead of the bats. @@ -91,14 +92,13 @@ extern int init_bootmem_done; int __map_without_bats; int __map_without_ltlbs; -/* max amount of low RAM to map in */ -unsigned long __max_low_memory = MAX_LOW_MEM; - /* - * limit of what is accessible with initial MMU setup - - * 256MB usually, but only 16MB on 601. + * This tells the system to allow ioremapping memory marked as reserved. */ -unsigned long __initial_memory_limit = 0x10000000; +int __allow_ioremap_reserved; + +/* max amount of low RAM to map in */ +unsigned long __max_low_memory = MAX_LOW_MEM; /* * Check for command-line options that affect what MMU_init will do. @@ -129,24 +129,26 @@ void __init MMU_init(void) if (ppc_md.progress) ppc_md.progress("MMU:enter", 0x111); - /* 601 can only access 16MB at the moment */ - if (PVR_VER(mfspr(SPRN_PVR)) == 1) - __initial_memory_limit = 0x01000000; - /* 8xx can only access 8MB at the moment */ - if (PVR_VER(mfspr(SPRN_PVR)) == 0x50) - __initial_memory_limit = 0x00800000; - /* parse args from command line */ MMU_setup(); - if (lmb.memory.cnt > 1) { - lmb.memory.cnt = 1; - lmb_analyze(); + /* + * Reserve gigantic pages for hugetlb. This MUST occur before + * lowmem_end_addr is initialized below. + */ + reserve_hugetlb_gpages(); + + if (memblock.memory.cnt > 1) { +#ifndef CONFIG_WII + memblock_enforce_memory_limit(memblock.memory.regions[0].size); printk(KERN_WARNING "Only using first contiguous memory region"); +#else + wii_memory_fixups(); +#endif } - total_memory = lmb_end_of_DRAM(); - total_lowmem = total_memory; + total_lowmem = total_memory = memblock_end_of_DRAM() - memstart_addr; + lowmem_end_addr = memstart_addr + total_lowmem; #ifdef CONFIG_FSL_BOOKE /* Freescale Book-E parts expect lowmem to be mapped by fixed TLB @@ -157,10 +159,10 @@ void __init MMU_init(void) if (total_lowmem > __max_low_memory) { total_lowmem = __max_low_memory; + lowmem_end_addr = memstart_addr + total_lowmem; #ifndef CONFIG_HIGHMEM total_memory = total_lowmem; - lmb_enforce_memory_limit(total_lowmem); - lmb_analyze(); + memblock_enforce_memory_limit(total_lowmem); #endif /* CONFIG_HIGHMEM */ } @@ -174,21 +176,12 @@ void __init MMU_init(void) ppc_md.progress("MMU:mapin", 0x301); mapin_ram(); -#ifdef CONFIG_HIGHMEM - ioremap_base = PKMAP_BASE; -#else - ioremap_base = 0xfe000000UL; /* for now, could be 0xfffff000 */ -#endif /* CONFIG_HIGHMEM */ - ioremap_bot = ioremap_base; + /* Initialize early top-down ioremap allocator */ + ioremap_bot = IOREMAP_TOP; /* Map in I/O resources */ if (ppc_md.progress) ppc_md.progress("MMU:setio", 0x302); - if (ppc_md.setup_io_mappings) - ppc_md.setup_io_mappings(); - - /* Initialize the context management stuff */ - mmu_context_init(); if (ppc_md.progress) ppc_md.progress("MMU:exit", 0x211); @@ -197,101 +190,35 @@ void __init MMU_init(void) #ifdef CONFIG_BOOTX_TEXT btext_unmap(); #endif + + /* Shortly after that, the entire linear mapping will be available */ + memblock_set_current_limit(lowmem_end_addr); } /* This is only called until mem_init is done. */ void __init *early_get_page(void) { - void *p; - - if (init_bootmem_done) { - p = alloc_bootmem_pages(PAGE_SIZE); - } else { - p = __va(lmb_alloc_base(PAGE_SIZE, PAGE_SIZE, - __initial_memory_limit)); - } - return p; -} - -/* Free up now-unused memory */ -static void free_sec(unsigned long start, unsigned long end, const char *name) -{ - unsigned long cnt = 0; - - while (start < end) { - ClearPageReserved(virt_to_page(start)); - init_page_count(virt_to_page(start)); - free_page(start); - cnt++; - start += PAGE_SIZE; - } - if (cnt) { - printk(" %ldk %s", cnt << (PAGE_SHIFT - 10), name); - totalram_pages += cnt; - } -} - -void free_initmem(void) -{ -#define FREESEC(TYPE) \ - free_sec((unsigned long)(&__ ## TYPE ## _begin), \ - (unsigned long)(&__ ## TYPE ## _end), \ - #TYPE); - - printk ("Freeing unused kernel memory:"); - FREESEC(init); - printk("\n"); - ppc_md.progress = NULL; -#undef FREESEC + if (init_bootmem_done) + return alloc_bootmem_pages(PAGE_SIZE); + else + return __va(memblock_alloc(PAGE_SIZE, PAGE_SIZE)); } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) +#ifdef CONFIG_8xx /* No 8xx specific .c file to put that in ... */ +void setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) { - if (start < end) - printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); - for (; start < end; start += PAGE_SIZE) { - ClearPageReserved(virt_to_page(start)); - init_page_count(virt_to_page(start)); - free_page(start); - totalram_pages++; - } -} + /* We don't currently support the first MEMBLOCK not mapping 0 + * physical on those processors + */ + BUG_ON(first_memblock_base != 0); + +#ifdef CONFIG_PIN_TLB + /* 8xx can only access 24MB at the moment */ + memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01800000)); +#else + /* 8xx can only access 8MB at the moment */ + memblock_set_current_limit(min_t(u64, first_memblock_size, 0x00800000)); #endif - -#ifdef CONFIG_PROC_KCORE -static struct kcore_list kcore_vmem; - -static int __init setup_kcore(void) -{ - int i; - - for (i = 0; i < lmb.memory.cnt; i++) { - unsigned long base; - unsigned long size; - struct kcore_list *kcore_mem; - - base = lmb.memory.region[i].base; - size = lmb.memory.region[i].size; - - kcore_mem = kmalloc(sizeof(struct kcore_list), GFP_ATOMIC); - if (!kcore_mem) - panic("%s: kmalloc failed\n", __FUNCTION__); - - /* must stay under 32 bits */ - if ( 0xfffffffful - (unsigned long)__va(base) < size) { - size = 0xfffffffful - (unsigned long)(__va(base)); - printk(KERN_DEBUG "setup_kcore: restrict size=%lx\n", - size); - } - - kclist_add(kcore_mem, __va(base), size); - } - - kclist_add(&kcore_vmem, (void *)VMALLOC_START, - VMALLOC_END-VMALLOC_START); - - return 0; } -module_init(setup_kcore); -#endif +#endif /* CONFIG_8xx */ diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index c0f5cff7703..e3734edffa6 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -19,6 +19,8 @@ * */ +#undef DEBUG + #include <linux/signal.h> #include <linux/sched.h> #include <linux/kernel.h> @@ -38,11 +40,13 @@ #include <linux/nodemask.h> #include <linux/module.h> #include <linux/poison.h> +#include <linux/memblock.h> +#include <linux/hugetlb.h> +#include <linux/slab.h> #include <asm/pgalloc.h> #include <asm/page.h> #include <asm/prom.h> -#include <asm/lmb.h> #include <asm/rtas.h> #include <asm/io.h> #include <asm/mmu_context.h> @@ -57,13 +61,12 @@ #include <asm/mmzone.h> #include <asm/cputable.h> #include <asm/sections.h> -#include <asm/system.h> #include <asm/iommu.h> -#include <asm/abs_addr.h> #include <asm/vdso.h> #include "mmu_decl.h" +#ifdef CONFIG_PPC_STD_MMU_64 #if PGTABLE_RANGE > USER_VSID_RANGE #warning Limited user VSID range means pagetable space is wasted #endif @@ -71,109 +74,82 @@ #if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE) #warning TASK_SIZE is smaller than it needs to be. #endif +#endif /* CONFIG_PPC_STD_MMU_64 */ -/* max amount of RAM to use */ -unsigned long __max_memory; +phys_addr_t memstart_addr = ~0; +EXPORT_SYMBOL_GPL(memstart_addr); +phys_addr_t kernstart_addr; +EXPORT_SYMBOL_GPL(kernstart_addr); -void free_initmem(void) +static void pgd_ctor(void *addr) { - unsigned long addr; - - addr = (unsigned long)__init_begin; - for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) { - memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); - ClearPageReserved(virt_to_page(addr)); - init_page_count(virt_to_page(addr)); - free_page(addr); - totalram_pages++; - } - printk ("Freeing unused kernel memory: %luk freed\n", - ((unsigned long)__init_end - (unsigned long)__init_begin) >> 10); + memset(addr, 0, PGD_TABLE_SIZE); } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) +static void pmd_ctor(void *addr) { - if (start < end) - printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); - for (; start < end; start += PAGE_SIZE) { - ClearPageReserved(virt_to_page(start)); - init_page_count(virt_to_page(start)); - free_page(start); - totalram_pages++; - } -} +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + memset(addr, 0, PMD_TABLE_SIZE * 2); +#else + memset(addr, 0, PMD_TABLE_SIZE); #endif - -#ifdef CONFIG_PROC_KCORE -static struct kcore_list kcore_vmem; - -static int __init setup_kcore(void) -{ - int i; - - for (i=0; i < lmb.memory.cnt; i++) { - unsigned long base, size; - struct kcore_list *kcore_mem; - - base = lmb.memory.region[i].base; - size = lmb.memory.region[i].size; - - /* GFP_ATOMIC to avoid might_sleep warnings during boot */ - kcore_mem = kmalloc(sizeof(struct kcore_list), GFP_ATOMIC); - if (!kcore_mem) - panic("%s: kmalloc failed\n", __FUNCTION__); - - kclist_add(kcore_mem, __va(base), size); - } - - kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START); - - return 0; } -module_init(setup_kcore); -#endif -static void zero_ctor(struct kmem_cache *cache, void *addr) +struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE]; + +/* + * Create a kmem_cache() for pagetables. This is not used for PTE + * pages - they're linked to struct page, come from the normal free + * pages pool and have a different entry size (see real_pte_t) to + * everything else. Caches created by this function are used for all + * the higher level pagetables, and for hugepage pagetables. + */ +void pgtable_cache_add(unsigned shift, void (*ctor)(void *)) { - memset(addr, 0, kmem_cache_size(cache)); + char *name; + unsigned long table_size = sizeof(void *) << shift; + unsigned long align = table_size; + + /* When batching pgtable pointers for RCU freeing, we store + * the index size in the low bits. Table alignment must be + * big enough to fit it. + * + * Likewise, hugeapge pagetable pointers contain a (different) + * shift value in the low bits. All tables must be aligned so + * as to leave enough 0 bits in the address to contain it. */ + unsigned long minalign = max(MAX_PGTABLE_INDEX_SIZE + 1, + HUGEPD_SHIFT_MASK + 1); + struct kmem_cache *new; + + /* It would be nice if this was a BUILD_BUG_ON(), but at the + * moment, gcc doesn't seem to recognize is_power_of_2 as a + * constant expression, so so much for that. */ + BUG_ON(!is_power_of_2(minalign)); + BUG_ON((shift < 1) || (shift > MAX_PGTABLE_INDEX_SIZE)); + + if (PGT_CACHE(shift)) + return; /* Already have a cache of this size */ + + align = max_t(unsigned long, align, minalign); + name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift); + new = kmem_cache_create(name, table_size, align, 0, ctor); + pgtable_cache[shift - 1] = new; + pr_debug("Allocated pgtable cache for order %d\n", shift); } -static const unsigned int pgtable_cache_size[2] = { - PGD_TABLE_SIZE, PMD_TABLE_SIZE -}; -static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = { -#ifdef CONFIG_PPC_64K_PAGES - "pgd_cache", "pmd_cache", -#else - "pgd_cache", "pud_pmd_cache", -#endif /* CONFIG_PPC_64K_PAGES */ -}; - -#ifdef CONFIG_HUGETLB_PAGE -/* Hugepages need one extra cache, initialized in hugetlbpage.c. We - * can't put into the tables above, because HPAGE_SHIFT is not compile - * time constant. */ -struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+1]; -#else -struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)]; -#endif void pgtable_cache_init(void) { - int i; - - for (i = 0; i < ARRAY_SIZE(pgtable_cache_size); i++) { - int size = pgtable_cache_size[i]; - const char *name = pgtable_cache_name[i]; - - pr_debug("Allocating page table cache %s (#%d) " - "for size: %08x...\n", name, i, size); - pgtable_cache[i] = kmem_cache_create(name, - size, size, - SLAB_PANIC, - zero_ctor); - } + pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor); + pgtable_cache_add(PMD_CACHE_INDEX, pmd_ctor); + if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_CACHE_INDEX)) + panic("Couldn't allocate pgtable caches"); + /* In all current configs, when the PUD index exists it's the + * same size as either the pgd or pmd index. Verify that the + * initialization above has also created a PUD cache. This + * will need re-examiniation if we add new possibilities for + * the pagetable layout. */ + BUG_ON(PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE)); } #ifdef CONFIG_SPARSEMEM_VMEMMAP @@ -183,7 +159,7 @@ void pgtable_cache_init(void) * do this by hand as the proffered address may not be correctly aligned. * Subtraction of non-aligned pointers produces undefined results. */ -unsigned long __meminit vmemmap_section_start(unsigned long page) +static unsigned long __meminit vmemmap_section_start(unsigned long page) { unsigned long offset = page - ((unsigned long)(vmemmap)); @@ -196,7 +172,7 @@ unsigned long __meminit vmemmap_section_start(unsigned long page) * which overlaps this vmemmap page is initialised then this page is * initialised already. */ -int __meminit vmemmap_populated(unsigned long start, int page_size) +static int __meminit vmemmap_populated(unsigned long start, int page_size) { unsigned long end = start + page_size; @@ -207,21 +183,99 @@ int __meminit vmemmap_populated(unsigned long start, int page_size) return 0; } -int __meminit vmemmap_populate(struct page *start_page, - unsigned long nr_pages, int node) +/* On hash-based CPUs, the vmemmap is bolted in the hash table. + * + * On Book3E CPUs, the vmemmap is currently mapped in the top half of + * the vmalloc space using normal page tables, though the size of + * pages encoded in the PTEs can be different + */ + +#ifdef CONFIG_PPC_BOOK3E +static void __meminit vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + /* Create a PTE encoding without page size */ + unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED | + _PAGE_KERNEL_RW; + + /* PTEs only contain page size encodings up to 32M */ + BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf); + + /* Encode the size in the PTE */ + flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8; + + /* For each PTE for that area, map things. Note that we don't + * increment phys because all PTEs are of the large size and + * thus must have the low bits clear + */ + for (i = 0; i < page_size; i += PAGE_SIZE) + BUG_ON(map_kernel_page(start + i, phys, flags)); +} +#else /* CONFIG_PPC_BOOK3E */ +static void __meminit vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + int mapped = htab_bolt_mapping(start, start + page_size, phys, + pgprot_val(PAGE_KERNEL), + mmu_vmemmap_psize, + mmu_kernel_ssize); + BUG_ON(mapped < 0); +} +#endif /* CONFIG_PPC_BOOK3E */ + +struct vmemmap_backing *vmemmap_list; + +static __meminit struct vmemmap_backing * vmemmap_list_alloc(int node) { - unsigned long mode_rw; - unsigned long start = (unsigned long)start_page; - unsigned long end = (unsigned long)(start_page + nr_pages); - unsigned long page_size = 1 << mmu_psize_defs[mmu_linear_psize].shift; + static struct vmemmap_backing *next; + static int num_left; + + /* allocate a page when required and hand out chunks */ + if (!next || !num_left) { + next = vmemmap_alloc_block(PAGE_SIZE, node); + if (unlikely(!next)) { + WARN_ON(1); + return NULL; + } + num_left = PAGE_SIZE / sizeof(struct vmemmap_backing); + } - mode_rw = _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_COHERENT | PP_RWXX; + num_left--; + + return next++; +} + +static __meminit void vmemmap_list_populate(unsigned long phys, + unsigned long start, + int node) +{ + struct vmemmap_backing *vmem_back; + + vmem_back = vmemmap_list_alloc(node); + if (unlikely(!vmem_back)) { + WARN_ON(1); + return; + } + + vmem_back->phys = phys; + vmem_back->virt_addr = start; + vmem_back->list = vmemmap_list; + + vmemmap_list = vmem_back; +} + +int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) +{ + unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; /* Align to the page size of the linear mapping. */ start = _ALIGN_DOWN(start, page_size); + pr_debug("vmemmap_populate %lx..%lx, node %d\n", start, end, node); + for (; start < end; start += page_size) { - int mapped; void *p; if (vmemmap_populated(start, page_size)) @@ -231,15 +285,73 @@ int __meminit vmemmap_populate(struct page *start_page, if (!p) return -ENOMEM; - pr_debug("vmemmap %08lx allocated at %p, physical %08lx.\n", - start, p, __pa(p)); + vmemmap_list_populate(__pa(p), start, node); - mapped = htab_bolt_mapping(start, start + page_size, - __pa(p), mode_rw, mmu_linear_psize, - mmu_kernel_ssize); - BUG_ON(mapped < 0); + pr_debug(" * %016lx..%016lx allocated at %p\n", + start, start + page_size, p); + + vmemmap_create_mapping(start, page_size, __pa(p)); } return 0; } -#endif + +void vmemmap_free(unsigned long start, unsigned long end) +{ +} + +void register_page_bootmem_memmap(unsigned long section_nr, + struct page *start_page, unsigned long size) +{ +} + +/* + * We do not have access to the sparsemem vmemmap, so we fallback to + * walking the list of sparsemem blocks which we already maintain for + * the sake of crashdump. In the long run, we might want to maintain + * a tree if performance of that linear walk becomes a problem. + * + * realmode_pfn_to_page functions can fail due to: + * 1) As real sparsemem blocks do not lay in RAM continously (they + * are in virtual address space which is not available in the real mode), + * the requested page struct can be split between blocks so get_page/put_page + * may fail. + * 2) When huge pages are used, the get_page/put_page API will fail + * in real mode as the linked addresses in the page struct are virtual + * too. + */ +struct page *realmode_pfn_to_page(unsigned long pfn) +{ + struct vmemmap_backing *vmem_back; + struct page *page; + unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; + unsigned long pg_va = (unsigned long) pfn_to_page(pfn); + + for (vmem_back = vmemmap_list; vmem_back; vmem_back = vmem_back->list) { + if (pg_va < vmem_back->virt_addr) + continue; + + /* Check that page struct is not split between real pages */ + if ((pg_va + sizeof(struct page)) > + (vmem_back->virt_addr + page_size)) + return NULL; + + page = (struct page *) (vmem_back->phys + pg_va - + vmem_back->virt_addr); + return page; + } + + return NULL; +} +EXPORT_SYMBOL_GPL(realmode_pfn_to_page); + +#elif defined(CONFIG_FLATMEM) + +struct page *realmode_pfn_to_page(unsigned long pfn) +{ + struct page *page = pfn_to_page(pfn); + return page; +} +EXPORT_SYMBOL_GPL(realmode_pfn_to_page); + +#endif /* CONFIG_SPARSEMEM_VMEMMAP/CONFIG_FLATMEM */ diff --git a/arch/powerpc/mm/lmb.c b/arch/powerpc/mm/lmb.c deleted file mode 100644 index 4ce23bcf8a5..00000000000 --- a/arch/powerpc/mm/lmb.c +++ /dev/null @@ -1,357 +0,0 @@ -/* - * Procedures for maintaining information about logical memory blocks. - * - * Peter Bergner, IBM Corp. June 2001. - * Copyright (C) 2001 Peter Bergner. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/bitops.h> -#include <asm/types.h> -#include <asm/page.h> -#include <asm/prom.h> -#include <asm/lmb.h> -#ifdef CONFIG_PPC32 -#include "mmu_decl.h" /* for __max_low_memory */ -#endif - -#undef DEBUG - -#ifdef DEBUG -#include <asm/udbg.h> -#define DBG(fmt...) udbg_printf(fmt) -#else -#define DBG(fmt...) -#endif - -#define LMB_ALLOC_ANYWHERE 0 - -struct lmb lmb; - -void lmb_dump_all(void) -{ -#ifdef DEBUG - unsigned long i; - - DBG("lmb_dump_all:\n"); - DBG(" memory.cnt = 0x%lx\n", lmb.memory.cnt); - DBG(" memory.size = 0x%lx\n", lmb.memory.size); - for (i=0; i < lmb.memory.cnt ;i++) { - DBG(" memory.region[0x%x].base = 0x%lx\n", - i, lmb.memory.region[i].base); - DBG(" .size = 0x%lx\n", - lmb.memory.region[i].size); - } - - DBG("\n reserved.cnt = 0x%lx\n", lmb.reserved.cnt); - DBG(" reserved.size = 0x%lx\n", lmb.reserved.size); - for (i=0; i < lmb.reserved.cnt ;i++) { - DBG(" reserved.region[0x%x].base = 0x%lx\n", - i, lmb.reserved.region[i].base); - DBG(" .size = 0x%lx\n", - lmb.reserved.region[i].size); - } -#endif /* DEBUG */ -} - -static unsigned long __init lmb_addrs_overlap(unsigned long base1, - unsigned long size1, unsigned long base2, unsigned long size2) -{ - return ((base1 < (base2+size2)) && (base2 < (base1+size1))); -} - -static long __init lmb_addrs_adjacent(unsigned long base1, unsigned long size1, - unsigned long base2, unsigned long size2) -{ - if (base2 == base1 + size1) - return 1; - else if (base1 == base2 + size2) - return -1; - - return 0; -} - -static long __init lmb_regions_adjacent(struct lmb_region *rgn, - unsigned long r1, unsigned long r2) -{ - unsigned long base1 = rgn->region[r1].base; - unsigned long size1 = rgn->region[r1].size; - unsigned long base2 = rgn->region[r2].base; - unsigned long size2 = rgn->region[r2].size; - - return lmb_addrs_adjacent(base1, size1, base2, size2); -} - -static void __init lmb_remove_region(struct lmb_region *rgn, unsigned long r) -{ - unsigned long i; - - for (i = r; i < rgn->cnt - 1; i++) { - rgn->region[i].base = rgn->region[i + 1].base; - rgn->region[i].size = rgn->region[i + 1].size; - } - rgn->cnt--; -} - -/* Assumption: base addr of region 1 < base addr of region 2 */ -static void __init lmb_coalesce_regions(struct lmb_region *rgn, - unsigned long r1, unsigned long r2) -{ - rgn->region[r1].size += rgn->region[r2].size; - lmb_remove_region(rgn, r2); -} - -/* This routine called with relocation disabled. */ -void __init lmb_init(void) -{ - /* Create a dummy zero size LMB which will get coalesced away later. - * This simplifies the lmb_add() code below... - */ - lmb.memory.region[0].base = 0; - lmb.memory.region[0].size = 0; - lmb.memory.cnt = 1; - - /* Ditto. */ - lmb.reserved.region[0].base = 0; - lmb.reserved.region[0].size = 0; - lmb.reserved.cnt = 1; -} - -/* This routine may be called with relocation disabled. */ -void __init lmb_analyze(void) -{ - int i; - - lmb.memory.size = 0; - - for (i = 0; i < lmb.memory.cnt; i++) - lmb.memory.size += lmb.memory.region[i].size; -} - -/* This routine called with relocation disabled. */ -static long __init lmb_add_region(struct lmb_region *rgn, unsigned long base, - unsigned long size) -{ - unsigned long coalesced = 0; - long adjacent, i; - - /* First try and coalesce this LMB with another. */ - for (i=0; i < rgn->cnt; i++) { - unsigned long rgnbase = rgn->region[i].base; - unsigned long rgnsize = rgn->region[i].size; - - if ((rgnbase == base) && (rgnsize == size)) - /* Already have this region, so we're done */ - return 0; - - adjacent = lmb_addrs_adjacent(base,size,rgnbase,rgnsize); - if ( adjacent > 0 ) { - rgn->region[i].base -= size; - rgn->region[i].size += size; - coalesced++; - break; - } - else if ( adjacent < 0 ) { - rgn->region[i].size += size; - coalesced++; - break; - } - } - - if ((i < rgn->cnt-1) && lmb_regions_adjacent(rgn, i, i+1) ) { - lmb_coalesce_regions(rgn, i, i+1); - coalesced++; - } - - if (coalesced) - return coalesced; - if (rgn->cnt >= MAX_LMB_REGIONS) - return -1; - - /* Couldn't coalesce the LMB, so add it to the sorted table. */ - for (i = rgn->cnt-1; i >= 0; i--) { - if (base < rgn->region[i].base) { - rgn->region[i+1].base = rgn->region[i].base; - rgn->region[i+1].size = rgn->region[i].size; - } else { - rgn->region[i+1].base = base; - rgn->region[i+1].size = size; - break; - } - } - rgn->cnt++; - - return 0; -} - -/* This routine may be called with relocation disabled. */ -long __init lmb_add(unsigned long base, unsigned long size) -{ - struct lmb_region *_rgn = &(lmb.memory); - - /* On pSeries LPAR systems, the first LMB is our RMO region. */ - if (base == 0) - lmb.rmo_size = size; - - return lmb_add_region(_rgn, base, size); - -} - -long __init lmb_reserve(unsigned long base, unsigned long size) -{ - struct lmb_region *_rgn = &(lmb.reserved); - - BUG_ON(0 == size); - - return lmb_add_region(_rgn, base, size); -} - -long __init lmb_overlaps_region(struct lmb_region *rgn, unsigned long base, - unsigned long size) -{ - unsigned long i; - - for (i=0; i < rgn->cnt; i++) { - unsigned long rgnbase = rgn->region[i].base; - unsigned long rgnsize = rgn->region[i].size; - if ( lmb_addrs_overlap(base,size,rgnbase,rgnsize) ) { - break; - } - } - - return (i < rgn->cnt) ? i : -1; -} - -unsigned long __init lmb_alloc(unsigned long size, unsigned long align) -{ - return lmb_alloc_base(size, align, LMB_ALLOC_ANYWHERE); -} - -unsigned long __init lmb_alloc_base(unsigned long size, unsigned long align, - unsigned long max_addr) -{ - unsigned long alloc; - - alloc = __lmb_alloc_base(size, align, max_addr); - - if (alloc == 0) - panic("ERROR: Failed to allocate 0x%lx bytes below 0x%lx.\n", - size, max_addr); - - return alloc; -} - -unsigned long __init __lmb_alloc_base(unsigned long size, unsigned long align, - unsigned long max_addr) -{ - long i, j; - unsigned long base = 0; - - BUG_ON(0 == size); - -#ifdef CONFIG_PPC32 - /* On 32-bit, make sure we allocate lowmem */ - if (max_addr == LMB_ALLOC_ANYWHERE) - max_addr = __max_low_memory; -#endif - for (i = lmb.memory.cnt-1; i >= 0; i--) { - unsigned long lmbbase = lmb.memory.region[i].base; - unsigned long lmbsize = lmb.memory.region[i].size; - - if (max_addr == LMB_ALLOC_ANYWHERE) - base = _ALIGN_DOWN(lmbbase + lmbsize - size, align); - else if (lmbbase < max_addr) { - base = min(lmbbase + lmbsize, max_addr); - base = _ALIGN_DOWN(base - size, align); - } else - continue; - - while ((lmbbase <= base) && - ((j = lmb_overlaps_region(&lmb.reserved, base, size)) >= 0) ) - base = _ALIGN_DOWN(lmb.reserved.region[j].base - size, - align); - - if ((base != 0) && (lmbbase <= base)) - break; - } - - if (i < 0) - return 0; - - lmb_add_region(&lmb.reserved, base, size); - - return base; -} - -/* You must call lmb_analyze() before this. */ -unsigned long __init lmb_phys_mem_size(void) -{ - return lmb.memory.size; -} - -unsigned long __init lmb_end_of_DRAM(void) -{ - int idx = lmb.memory.cnt - 1; - - return (lmb.memory.region[idx].base + lmb.memory.region[idx].size); -} - -/* You must call lmb_analyze() after this. */ -void __init lmb_enforce_memory_limit(unsigned long memory_limit) -{ - unsigned long i, limit; - struct lmb_property *p; - - if (! memory_limit) - return; - - /* Truncate the lmb regions to satisfy the memory limit. */ - limit = memory_limit; - for (i = 0; i < lmb.memory.cnt; i++) { - if (limit > lmb.memory.region[i].size) { - limit -= lmb.memory.region[i].size; - continue; - } - - lmb.memory.region[i].size = limit; - lmb.memory.cnt = i + 1; - break; - } - - if (lmb.memory.region[0].size < lmb.rmo_size) - lmb.rmo_size = lmb.memory.region[0].size; - - /* And truncate any reserves above the limit also. */ - for (i = 0; i < lmb.reserved.cnt; i++) { - p = &lmb.reserved.region[i]; - - if (p->base > memory_limit) - p->size = 0; - else if ((p->base + p->size) > memory_limit) - p->size = memory_limit - p->base; - - if (p->size == 0) { - lmb_remove_region(&lmb.reserved, i); - i--; - } - } -} - -int __init lmb_is_reserved(unsigned long addr) -{ - int i; - - for (i = 0; i < lmb.reserved.cnt; i++) { - unsigned long upper = lmb.reserved.region[i].base + - lmb.reserved.region[i].size - 1; - if ((addr >= lmb.reserved.region[i].base) && (addr <= upper)) - return 1; - } - return 0; -} diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index be5c506779a..2c8e90f5789 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -17,11 +17,12 @@ * */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/sched.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/string.h> +#include <linux/gfp.h> #include <linux/types.h> #include <linux/mm.h> #include <linux/stddef.h> @@ -31,6 +32,9 @@ #include <linux/initrd.h> #include <linux/pagemap.h> #include <linux/suspend.h> +#include <linux/memblock.h> +#include <linux/hugetlb.h> +#include <linux/slab.h> #include <asm/pgalloc.h> #include <asm/prom.h> @@ -42,9 +46,12 @@ #include <asm/machdep.h> #include <asm/btext.h> #include <asm/tlb.h> -#include <asm/lmb.h> #include <asm/sections.h> +#include <asm/sparsemem.h> #include <asm/vdso.h> +#include <asm/fixmap.h> +#include <asm/swiotlb.h> +#include <asm/rtas.h> #include "mmu_decl.h" @@ -55,27 +62,32 @@ int init_bootmem_done; int mem_init_done; -unsigned long memory_limit; +unsigned long long memory_limit; -int page_is_ram(unsigned long pfn) +#ifdef CONFIG_HIGHMEM +pte_t *kmap_pte; +EXPORT_SYMBOL(kmap_pte); +pgprot_t kmap_prot; +EXPORT_SYMBOL(kmap_prot); + +static inline pte_t *virt_to_kpte(unsigned long vaddr) { - unsigned long paddr = (pfn << PAGE_SHIFT); + return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), + vaddr), vaddr), vaddr); +} +#endif +int page_is_ram(unsigned long pfn) +{ #ifndef CONFIG_PPC64 /* XXX for now */ - return paddr < __pa(high_memory); + return pfn < max_pfn; #else - int i; - for (i=0; i < lmb.memory.cnt; i++) { - unsigned long base; - - base = lmb.memory.region[i].base; + unsigned long paddr = (pfn << PAGE_SHIFT); + struct memblock_region *reg; - if ((paddr >= base) && - (paddr < (base + lmb.memory.region[i].size))) { + for_each_memblock(memory, reg) + if (paddr >= reg->base && paddr < (reg->base + reg->size)) return 1; - } - } - return 0; #endif } @@ -87,23 +99,14 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, return ppc_md.phys_mem_access_prot(file, pfn, size, vma_prot); if (!page_is_ram(pfn)) - vma_prot = __pgprot(pgprot_val(vma_prot) - | _PAGE_GUARDED | _PAGE_NO_CACHE); + vma_prot = pgprot_noncached(vma_prot); + return vma_prot; } EXPORT_SYMBOL(phys_mem_access_prot); #ifdef CONFIG_MEMORY_HOTPLUG -void online_page(struct page *page) -{ - ClearPageReserved(page); - init_page_count(page); - __free_page(page); - totalram_pages++; - num_physpages++; -} - #ifdef CONFIG_NUMA int memory_add_physaddr_to_nid(u64 start) { @@ -111,7 +114,7 @@ int memory_add_physaddr_to_nid(u64 start) } #endif -int __devinit arch_add_memory(int nid, u64 start, u64 size) +int arch_add_memory(int nid, u64 start, u64 size) { struct pglist_data *pgdata; struct zone *zone; @@ -121,88 +124,60 @@ int __devinit arch_add_memory(int nid, u64 start, u64 size) pgdata = NODE_DATA(nid); start = (unsigned long)__va(start); - create_section_mapping(start, start + size); + if (create_section_mapping(start, start + size)) + return -EINVAL; /* this should work for most non-highmem platforms */ zone = pgdata->node_zones; - return __add_pages(zone, start_pfn, nr_pages); + return __add_pages(nid, zone, start_pfn, nr_pages); } #ifdef CONFIG_MEMORY_HOTREMOVE -int remove_memory(u64 start, u64 size) +int arch_remove_memory(u64 start, u64 size) { - unsigned long start_pfn, end_pfn; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + struct zone *zone; int ret; - start_pfn = start >> PAGE_SHIFT; - end_pfn = start_pfn + (size >> PAGE_SHIFT); - ret = offline_pages(start_pfn, end_pfn, 120 * HZ); - if (ret) - goto out; - /* Arch-specific calls go here - next patch */ -out: + zone = page_zone(pfn_to_page(start_pfn)); + ret = __remove_pages(zone, start_pfn, nr_pages); + if (!ret && (ppc_md.remove_memory)) + ret = ppc_md.remove_memory(start, size); + return ret; } -#endif /* CONFIG_MEMORY_HOTREMOVE */ +#endif +#endif /* CONFIG_MEMORY_HOTPLUG */ /* * walk_memory_resource() needs to make sure there is no holes in a given - * memory range. On PPC64, since this range comes from /sysfs, the range - * is guaranteed to be valid, non-overlapping and can not contain any - * holes. By the time we get here (memory add or remove), /proc/device-tree - * is updated and correct. Only reason we need to check against device-tree - * would be if we allow user-land to specify a memory range through a - * system call/ioctl etc. instead of doing offline/online through /sysfs. + * memory range. PPC64 does not maintain the memory layout in /proc/iomem. + * Instead it maintains it in memblock.memory structures. Walk through the + * memory regions, find holes and callback for contiguous regions. */ int -walk_memory_resource(unsigned long start_pfn, unsigned long nr_pages, void *arg, - int (*func)(unsigned long, unsigned long, void *)) +walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, + void *arg, int (*func)(unsigned long, unsigned long, void *)) { - return (*func)(start_pfn, nr_pages, arg); -} - -#endif /* CONFIG_MEMORY_HOTPLUG */ - -void show_mem(void) -{ - unsigned long total = 0, reserved = 0; - unsigned long shared = 0, cached = 0; - unsigned long highmem = 0; - struct page *page; - pg_data_t *pgdat; - unsigned long i; - - printk("Mem-info:\n"); - show_free_areas(); - printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); - for_each_online_pgdat(pgdat) { - unsigned long flags; - pgdat_resize_lock(pgdat, &flags); - for (i = 0; i < pgdat->node_spanned_pages; i++) { - if (!pfn_valid(pgdat->node_start_pfn + i)) - continue; - page = pgdat_page_nr(pgdat, i); - total++; - if (PageHighMem(page)) - highmem++; - if (PageReserved(page)) - reserved++; - else if (PageSwapCache(page)) - cached++; - else if (page_count(page)) - shared += page_count(page) - 1; - } - pgdat_resize_unlock(pgdat, &flags); + struct memblock_region *reg; + unsigned long end_pfn = start_pfn + nr_pages; + unsigned long tstart, tend; + int ret = -1; + + for_each_memblock(memory, reg) { + tstart = max(start_pfn, memblock_region_memory_base_pfn(reg)); + tend = min(end_pfn, memblock_region_memory_end_pfn(reg)); + if (tstart >= tend) + continue; + ret = (*func)(tstart, tend - tstart, arg); + if (ret) + break; } - printk("%ld pages of RAM\n", total); -#ifdef CONFIG_HIGHMEM - printk("%ld pages of HIGHMEM\n", highmem); -#endif - printk("%ld reserved pages\n", reserved); - printk("%ld pages shared\n", shared); - printk("%ld pages swap cached\n", cached); + return ret; } +EXPORT_SYMBOL_GPL(walk_system_ram_range); /* * Initialize the bootmem system and give it all the memory we @@ -212,14 +187,16 @@ void show_mem(void) #ifndef CONFIG_NEED_MULTIPLE_NODES void __init do_init_bootmem(void) { - unsigned long i; unsigned long start, bootmap_pages; unsigned long total_pages; + struct memblock_region *reg; int boot_mapsize; - max_pfn = total_pages = lmb_end_of_DRAM() >> PAGE_SHIFT; + max_low_pfn = max_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; + total_pages = (memblock_end_of_DRAM() - memstart_addr) >> PAGE_SHIFT; #ifdef CONFIG_HIGHMEM total_pages = total_lowmem >> PAGE_SHIFT; + max_low_pfn = lowmem_end_addr >> PAGE_SHIFT; #endif /* @@ -229,48 +206,38 @@ void __init do_init_bootmem(void) */ bootmap_pages = bootmem_bootmap_pages(total_pages); - start = lmb_alloc(bootmap_pages << PAGE_SHIFT, PAGE_SIZE); + start = memblock_alloc(bootmap_pages << PAGE_SHIFT, PAGE_SIZE); - boot_mapsize = init_bootmem(start >> PAGE_SHIFT, total_pages); + min_low_pfn = MEMORY_START >> PAGE_SHIFT; + boot_mapsize = init_bootmem_node(NODE_DATA(0), start >> PAGE_SHIFT, min_low_pfn, max_low_pfn); - /* Add active regions with valid PFNs */ - for (i = 0; i < lmb.memory.cnt; i++) { - unsigned long start_pfn, end_pfn; - start_pfn = lmb.memory.region[i].base >> PAGE_SHIFT; - end_pfn = start_pfn + lmb_size_pages(&lmb.memory, i); - add_active_range(0, start_pfn, end_pfn); - } + /* Place all memblock_regions in the same node and merge contiguous + * memblock_regions + */ + memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0); /* Add all physical memory to the bootmem map, mark each area * present. */ #ifdef CONFIG_HIGHMEM - free_bootmem_with_active_regions(0, total_lowmem >> PAGE_SHIFT); + free_bootmem_with_active_regions(0, lowmem_end_addr >> PAGE_SHIFT); /* reserve the sections we're already using */ - for (i = 0; i < lmb.reserved.cnt; i++) { - unsigned long addr = lmb.reserved.region[i].base + - lmb_size_bytes(&lmb.reserved, i) - 1; - if (addr < total_lowmem) - reserve_bootmem(lmb.reserved.region[i].base, - lmb_size_bytes(&lmb.reserved, i), - BOOTMEM_DEFAULT); - else if (lmb.reserved.region[i].base < total_lowmem) { - unsigned long adjusted_size = total_lowmem - - lmb.reserved.region[i].base; - reserve_bootmem(lmb.reserved.region[i].base, - adjusted_size, BOOTMEM_DEFAULT); + for_each_memblock(reserved, reg) { + unsigned long top = reg->base + reg->size - 1; + if (top < lowmem_end_addr) + reserve_bootmem(reg->base, reg->size, BOOTMEM_DEFAULT); + else if (reg->base < lowmem_end_addr) { + unsigned long trunc_size = lowmem_end_addr - reg->base; + reserve_bootmem(reg->base, trunc_size, BOOTMEM_DEFAULT); } } #else free_bootmem_with_active_regions(0, max_pfn); /* reserve the sections we're already using */ - for (i = 0; i < lmb.reserved.cnt; i++) - reserve_bootmem(lmb.reserved.region[i].base, - lmb_size_bytes(&lmb.reserved, i), - BOOTMEM_DEFAULT); - + for_each_memblock(reserved, reg) + reserve_bootmem(reg->base, reg->size, BOOTMEM_DEFAULT); #endif /* XXX need to clip this if using highmem? */ sparse_memory_present_with_active_regions(0); @@ -281,22 +248,15 @@ void __init do_init_bootmem(void) /* mark pages that don't exist as nosave */ static int __init mark_nonram_nosave(void) { - unsigned long lmb_next_region_start_pfn, - lmb_region_max_pfn; - int i; - - for (i = 0; i < lmb.memory.cnt - 1; i++) { - lmb_region_max_pfn = - (lmb.memory.region[i].base >> PAGE_SHIFT) + - (lmb.memory.region[i].size >> PAGE_SHIFT); - lmb_next_region_start_pfn = - lmb.memory.region[i+1].base >> PAGE_SHIFT; - - if (lmb_region_max_pfn < lmb_next_region_start_pfn) - register_nosave_region(lmb_region_max_pfn, - lmb_next_region_start_pfn); + struct memblock_region *reg, *prev = NULL; + + for_each_memblock(memory, reg) { + if (prev && + memblock_region_memory_end_pfn(prev) < memblock_region_memory_base_pfn(reg)) + register_nosave_region(memblock_region_memory_end_pfn(prev), + memblock_region_memory_base_pfn(reg)); + prev = reg; } - return 0; } @@ -305,28 +265,33 @@ static int __init mark_nonram_nosave(void) */ void __init paging_init(void) { - unsigned long total_ram = lmb_phys_mem_size(); - unsigned long top_of_ram = lmb_end_of_DRAM(); + unsigned long long total_ram = memblock_phys_mem_size(); + phys_addr_t top_of_ram = memblock_end_of_DRAM(); unsigned long max_zone_pfns[MAX_NR_ZONES]; +#ifdef CONFIG_PPC32 + unsigned long v = __fix_to_virt(__end_of_fixed_addresses - 1); + unsigned long end = __fix_to_virt(FIX_HOLE); + + for (; v < end; v += PAGE_SIZE) + map_page(v, 0, 0); /* XXX gross */ +#endif + #ifdef CONFIG_HIGHMEM map_page(PKMAP_BASE, 0, 0); /* XXX gross */ - pkmap_page_table = pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k - (PKMAP_BASE), PKMAP_BASE), PKMAP_BASE), PKMAP_BASE); - map_page(KMAP_FIX_BEGIN, 0, 0); /* XXX gross */ - kmap_pte = pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k - (KMAP_FIX_BEGIN), KMAP_FIX_BEGIN), KMAP_FIX_BEGIN), - KMAP_FIX_BEGIN); + pkmap_page_table = virt_to_kpte(PKMAP_BASE); + + kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN)); kmap_prot = PAGE_KERNEL; #endif /* CONFIG_HIGHMEM */ - printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n", - top_of_ram, total_ram); + printk(KERN_DEBUG "Top of RAM: 0x%llx, Total RAM: 0x%llx\n", + (unsigned long long)top_of_ram, total_ram); printk(KERN_DEBUG "Memory hole size: %ldMB\n", - (top_of_ram - total_ram) >> 20); + (long int)((top_of_ram - total_ram) >> 20)); memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); #ifdef CONFIG_HIGHMEM - max_zone_pfns[ZONE_DMA] = total_lowmem >> PAGE_SHIFT; + max_zone_pfns[ZONE_DMA] = lowmem_end_addr >> PAGE_SHIFT; max_zone_pfns[ZONE_HIGHMEM] = top_of_ram >> PAGE_SHIFT; #else max_zone_pfns[ZONE_DMA] = top_of_ram >> PAGE_SHIFT; @@ -337,80 +302,88 @@ void __init paging_init(void) } #endif /* ! CONFIG_NEED_MULTIPLE_NODES */ -void __init mem_init(void) +static void __init register_page_bootmem_info(void) { -#ifdef CONFIG_NEED_MULTIPLE_NODES - int nid; -#endif - pg_data_t *pgdat; - unsigned long i; - struct page *page; - unsigned long reservedpages = 0, codesize, initsize, datasize, bsssize; + int i; - num_physpages = lmb.memory.size >> PAGE_SHIFT; - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); + for_each_online_node(i) + register_page_bootmem_info_node(NODE_DATA(i)); +} -#ifdef CONFIG_NEED_MULTIPLE_NODES - for_each_online_node(nid) { - if (NODE_DATA(nid)->node_spanned_pages != 0) { - printk("freeing bootmem node %d\n", nid); - totalram_pages += - free_all_bootmem_node(NODE_DATA(nid)); - } - } -#else - max_mapnr = max_pfn; - totalram_pages += free_all_bootmem(); +void __init mem_init(void) +{ + /* + * book3s is limited to 16 page sizes due to encoding this in + * a 4-bit field for slices. + */ + BUILD_BUG_ON(MMU_PAGE_COUNT > 16); + +#ifdef CONFIG_SWIOTLB + swiotlb_init(0); #endif - for_each_online_pgdat(pgdat) { - for (i = 0; i < pgdat->node_spanned_pages; i++) { - if (!pfn_valid(pgdat->node_start_pfn + i)) - continue; - page = pgdat_page_nr(pgdat, i); - if (PageReserved(page)) - reservedpages++; - } - } - codesize = (unsigned long)&_sdata - (unsigned long)&_stext; - datasize = (unsigned long)&_edata - (unsigned long)&_sdata; - initsize = (unsigned long)&__init_end - (unsigned long)&__init_begin; - bsssize = (unsigned long)&__bss_stop - (unsigned long)&__bss_start; + register_page_bootmem_info(); + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); + set_max_mapnr(max_pfn); + free_all_bootmem(); #ifdef CONFIG_HIGHMEM { unsigned long pfn, highmem_mapnr; - highmem_mapnr = total_lowmem >> PAGE_SHIFT; + highmem_mapnr = lowmem_end_addr >> PAGE_SHIFT; for (pfn = highmem_mapnr; pfn < max_mapnr; ++pfn) { + phys_addr_t paddr = (phys_addr_t)pfn << PAGE_SHIFT; struct page *page = pfn_to_page(pfn); - if (lmb_is_reserved(pfn << PAGE_SHIFT)) - continue; - ClearPageReserved(page); - init_page_count(page); - __free_page(page); - totalhigh_pages++; - reservedpages--; + if (!memblock_is_reserved(paddr)) + free_highmem_page(page); } - totalram_pages += totalhigh_pages; - printk(KERN_DEBUG "High memory: %luk\n", - totalhigh_pages << (PAGE_SHIFT-10)); } #endif /* CONFIG_HIGHMEM */ - printk(KERN_INFO "Memory: %luk/%luk available (%luk kernel code, " - "%luk reserved, %luk data, %luk bss, %luk init)\n", - (unsigned long)nr_free_pages() << (PAGE_SHIFT-10), - num_physpages << (PAGE_SHIFT-10), - codesize >> 10, - reservedpages << (PAGE_SHIFT-10), - datasize >> 10, - bsssize >> 10, - initsize >> 10); +#if defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_SMP) + /* + * If smp is enabled, next_tlbcam_idx is initialized in the cpu up + * functions.... do it here for the non-smp case. + */ + per_cpu(next_tlbcam_idx, smp_processor_id()) = + (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) - 1; +#endif + + mem_init_print_info(NULL); +#ifdef CONFIG_PPC32 + pr_info("Kernel virtual memory layout:\n"); + pr_info(" * 0x%08lx..0x%08lx : fixmap\n", FIXADDR_START, FIXADDR_TOP); +#ifdef CONFIG_HIGHMEM + pr_info(" * 0x%08lx..0x%08lx : highmem PTEs\n", + PKMAP_BASE, PKMAP_ADDR(LAST_PKMAP)); +#endif /* CONFIG_HIGHMEM */ +#ifdef CONFIG_NOT_COHERENT_CACHE + pr_info(" * 0x%08lx..0x%08lx : consistent mem\n", + IOREMAP_TOP, IOREMAP_TOP + CONFIG_CONSISTENT_SIZE); +#endif /* CONFIG_NOT_COHERENT_CACHE */ + pr_info(" * 0x%08lx..0x%08lx : early ioremap\n", + ioremap_bot, IOREMAP_TOP); + pr_info(" * 0x%08lx..0x%08lx : vmalloc & ioremap\n", + VMALLOC_START, VMALLOC_END); +#endif /* CONFIG_PPC32 */ mem_init_done = 1; } +void free_initmem(void) +{ + ppc_md.progress = ppc_printk_progress; + free_initmem_default(POISON_FREE_INITMEM); +} + +#ifdef CONFIG_BLK_DEV_INITRD +void __init free_initrd_mem(unsigned long start, unsigned long end) +{ + free_reserved_area((void *)start, (void *)end, -1, "initrd"); +} +#endif + /* * This is called when a page has been modified by the kernel. * It just marks the page as not i-cache clean. We do the i-cache @@ -428,24 +401,33 @@ EXPORT_SYMBOL(flush_dcache_page); void flush_dcache_icache_page(struct page *page) { +#ifdef CONFIG_HUGETLB_PAGE + if (PageCompound(page)) { + flush_dcache_icache_hugepage(page); + return; + } +#endif #ifdef CONFIG_BOOKE - void *start = kmap_atomic(page, KM_PPC_SYNC_ICACHE); - __flush_dcache_icache(start); - kunmap_atomic(start, KM_PPC_SYNC_ICACHE); + { + void *start = kmap_atomic(page); + __flush_dcache_icache(start); + kunmap_atomic(start); + } #elif defined(CONFIG_8xx) || defined(CONFIG_PPC64) /* On 8xx there is no need to kmap since highmem is not supported */ __flush_dcache_icache(page_address(page)); #else __flush_dcache_icache_phys(page_to_pfn(page) << PAGE_SHIFT); #endif - } +EXPORT_SYMBOL(flush_dcache_icache_page); + void clear_user_page(void *page, unsigned long vaddr, struct page *pg) { clear_page(page); /* - * We shouldnt have to do this, but some versions of glibc + * We shouldn't have to do this, but some versions of glibc * require it (ld.so assumes zero filled pages are icache clean) * - Anton */ @@ -496,46 +478,17 @@ EXPORT_SYMBOL(flush_icache_user_range); * This must always be called with the pte lock held. */ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, - pte_t pte) + pte_t *ptep) { #ifdef CONFIG_PPC_STD_MMU + /* + * We don't need to worry about _PAGE_PRESENT here because we are + * called with either mm->page_table_lock held or ptl lock held + */ unsigned long access = 0, trap; -#endif - unsigned long pfn = pte_pfn(pte); - - /* handle i-cache coherency */ - if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE) && - !cpu_has_feature(CPU_FTR_NOEXECUTE) && - pfn_valid(pfn)) { - struct page *page = pfn_to_page(pfn); -#ifdef CONFIG_8xx - /* On 8xx, cache control instructions (particularly - * "dcbst" from flush_dcache_icache) fault as write - * operation if there is an unpopulated TLB entry - * for the address in question. To workaround that, - * we invalidate the TLB here, thus avoiding dcbst - * misbehaviour. - */ - _tlbie(address, 0 /* 8xx doesn't care about PID */); -#endif - /* The _PAGE_USER test should really be _PAGE_EXEC, but - * older glibc versions execute some code from no-exec - * pages, which for now we are supporting. If exec-only - * pages are ever implemented, this will have to change. - */ - if (!PageReserved(page) && (pte_val(pte) & _PAGE_USER) - && !test_bit(PG_arch_1, &page->flags)) { - if (vma->vm_mm == current->active_mm) { - __flush_dcache_icache((void *) address); - } else - flush_dcache_icache_page(page); - set_bit(PG_arch_1, &page->flags); - } - } -#ifdef CONFIG_PPC_STD_MMU /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ - if (!pte_young(pte) || address >= TASK_SIZE) + if (!pte_young(*ptep) || address >= TASK_SIZE) return; /* We try to figure out if we are coming from an instruction @@ -554,4 +507,58 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, return; hash_preload(vma->vm_mm, address, access, trap); #endif /* CONFIG_PPC_STD_MMU */ +#if (defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_FSL_BOOK3E)) \ + && defined(CONFIG_HUGETLB_PAGE) + if (is_vm_hugetlb_page(vma)) + book3e_hugetlb_preload(vma, address, *ptep); +#endif +} + +/* + * System memory should not be in /proc/iomem but various tools expect it + * (eg kdump). + */ +static int __init add_system_ram_resources(void) +{ + struct memblock_region *reg; + + for_each_memblock(memory, reg) { + struct resource *res; + unsigned long base = reg->base; + unsigned long size = reg->size; + + res = kzalloc(sizeof(struct resource), GFP_KERNEL); + WARN_ON(!res); + + if (res) { + res->name = "System RAM"; + res->start = base; + res->end = base + size - 1; + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + WARN_ON(request_resource(&iomem_resource, res) < 0); + } + } + + return 0; +} +subsys_initcall(add_system_ram_resources); + +#ifdef CONFIG_STRICT_DEVMEM +/* + * devmem_is_allowed(): check to see if /dev/mem access to a certain address + * is valid. The argument is a physical page number. + * + * Access has to be given to non-kernel-ram areas as well, these contain the + * PCI mmio resources as well as potential bios/acpi data regions. + */ +int devmem_is_allowed(unsigned long pfn) +{ + if (iomem_is_exclusive(pfn << PAGE_SHIFT)) + return 0; + if (!page_is_ram(pfn)) + return 1; + if (page_is_rtas_user_buf(pfn)) + return 1; + return 0; } +#endif /* CONFIG_STRICT_DEVMEM */ diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c index 86010fc7d3b..cb8bdbe4972 100644 --- a/arch/powerpc/mm/mmap.c +++ b/arch/powerpc/mm/mmap.c @@ -24,43 +24,59 @@ #include <linux/personality.h> #include <linux/mm.h> +#include <linux/random.h> #include <linux/sched.h> /* * Top of mmap area (just below the process stack). * - * Leave an at least ~128 MB hole. + * Leave at least a ~128 MB hole on 32bit applications. + * + * On 64bit applications we randomise the stack by 1GB so we need to + * space our mmap start address by a further 1GB, otherwise there is a + * chance the mmap area will end up closer to the stack than our ulimit + * requires. */ -#define MIN_GAP (128*1024*1024) +#define MIN_GAP32 (128*1024*1024) +#define MIN_GAP64 ((128 + 1024)*1024*1024UL) +#define MIN_GAP ((is_32bit_task()) ? MIN_GAP32 : MIN_GAP64) #define MAX_GAP (TASK_SIZE/6*5) -static inline unsigned long mmap_base(void) +static inline int mmap_is_legacy(void) { - unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; + if (current->personality & ADDR_COMPAT_LAYOUT) + return 1; - if (gap < MIN_GAP) - gap = MIN_GAP; - else if (gap > MAX_GAP) - gap = MAX_GAP; + if (rlimit(RLIMIT_STACK) == RLIM_INFINITY) + return 1; - return TASK_SIZE - (gap & PAGE_MASK); + return sysctl_legacy_va_layout; } -static inline int mmap_is_legacy(void) +static unsigned long mmap_rnd(void) { - /* - * Force standard allocation for 64 bit programs. - */ - if (!test_thread_flag(TIF_32BIT)) - return 1; + unsigned long rnd = 0; - if (current->personality & ADDR_COMPAT_LAYOUT) - return 1; + if (current->flags & PF_RANDOMIZE) { + /* 8MB for 32bit, 1GB for 64bit */ + if (is_32bit_task()) + rnd = (long)(get_random_int() % (1<<(23-PAGE_SHIFT))); + else + rnd = (long)(get_random_int() % (1<<(30-PAGE_SHIFT))); + } + return rnd << PAGE_SHIFT; +} - if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) - return 1; +static inline unsigned long mmap_base(void) +{ + unsigned long gap = rlimit(RLIMIT_STACK); - return sysctl_legacy_va_layout; + if (gap < MIN_GAP) + gap = MIN_GAP; + else if (gap > MAX_GAP) + gap = MAX_GAP; + + return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd()); } /* @@ -76,10 +92,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (mmap_is_legacy()) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; } } diff --git a/arch/powerpc/mm/mmu_context_32.c b/arch/powerpc/mm/mmu_context_32.c deleted file mode 100644 index cc32ba41d90..00000000000 --- a/arch/powerpc/mm/mmu_context_32.c +++ /dev/null @@ -1,84 +0,0 @@ -/* - * This file contains the routines for handling the MMU on those - * PowerPC implementations where the MMU substantially follows the - * architecture specification. This includes the 6xx, 7xx, 7xxx, - * 8260, and POWER3 implementations but excludes the 8xx and 4xx. - * -- paulus - * - * Derived from arch/ppc/mm/init.c: - * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) - * - * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) - * and Cort Dougan (PReP) (cort@cs.nmt.edu) - * Copyright (C) 1996 Paul Mackerras - * - * Derived from "arch/i386/mm/init.c" - * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include <linux/mm.h> -#include <linux/init.h> - -#include <asm/mmu_context.h> -#include <asm/tlbflush.h> - -unsigned long next_mmu_context; -unsigned long context_map[LAST_CONTEXT / BITS_PER_LONG + 1]; -#ifdef FEW_CONTEXTS -atomic_t nr_free_contexts; -struct mm_struct *context_mm[LAST_CONTEXT+1]; -void steal_context(void); -#endif /* FEW_CONTEXTS */ - -/* - * Initialize the context management stuff. - */ -void __init -mmu_context_init(void) -{ - /* - * Some processors have too few contexts to reserve one for - * init_mm, and require using context 0 for a normal task. - * Other processors reserve the use of context zero for the kernel. - * This code assumes FIRST_CONTEXT < 32. - */ - context_map[0] = (1 << FIRST_CONTEXT) - 1; - next_mmu_context = FIRST_CONTEXT; -#ifdef FEW_CONTEXTS - atomic_set(&nr_free_contexts, LAST_CONTEXT - FIRST_CONTEXT + 1); -#endif /* FEW_CONTEXTS */ -} - -#ifdef FEW_CONTEXTS -/* - * Steal a context from a task that has one at the moment. - * This is only used on 8xx and 4xx and we presently assume that - * they don't do SMP. If they do then this will have to check - * whether the MM we steal is in use. - * We also assume that this is only used on systems that don't - * use an MMU hash table - this is true for 8xx and 4xx. - * This isn't an LRU system, it just frees up each context in - * turn (sort-of pseudo-random replacement :). This would be the - * place to implement an LRU scheme if anyone was motivated to do it. - * -- paulus - */ -void -steal_context(void) -{ - struct mm_struct *mm; - - /* free up context `next_mmu_context' */ - /* if we shouldn't free context 0, don't... */ - if (next_mmu_context < FIRST_CONTEXT) - next_mmu_context = FIRST_CONTEXT; - mm = context_mm[next_mmu_context]; - flush_tlb_mm(mm); - destroy_context(mm); -} -#endif /* FEW_CONTEXTS */ diff --git a/arch/powerpc/mm/mmu_context_64.c b/arch/powerpc/mm/mmu_context_64.c deleted file mode 100644 index 1db38ba1f54..00000000000 --- a/arch/powerpc/mm/mmu_context_64.c +++ /dev/null @@ -1,70 +0,0 @@ -/* - * MMU context allocation for 64-bit kernels. - * - * Copyright (C) 2004 Anton Blanchard, IBM Corp. <anton@samba.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/types.h> -#include <linux/mm.h> -#include <linux/spinlock.h> -#include <linux/idr.h> - -#include <asm/mmu_context.h> - -static DEFINE_SPINLOCK(mmu_context_lock); -static DEFINE_IDR(mmu_context_idr); - -int init_new_context(struct task_struct *tsk, struct mm_struct *mm) -{ - int index; - int err; - -again: - if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL)) - return -ENOMEM; - - spin_lock(&mmu_context_lock); - err = idr_get_new_above(&mmu_context_idr, NULL, 1, &index); - spin_unlock(&mmu_context_lock); - - if (err == -EAGAIN) - goto again; - else if (err) - return err; - - if (index > MAX_CONTEXT) { - spin_lock(&mmu_context_lock); - idr_remove(&mmu_context_idr, index); - spin_unlock(&mmu_context_lock); - return -ENOMEM; - } - - /* The old code would re-promote on fork, we don't do that - * when using slices as it could cause problem promoting slices - * that have been forced down to 4K - */ - if (slice_mm_new_context(mm)) - slice_set_user_psize(mm, mmu_virtual_psize); - mm->context.id = index; - - return 0; -} - -void destroy_context(struct mm_struct *mm) -{ - spin_lock(&mmu_context_lock); - idr_remove(&mmu_context_idr, mm->context.id); - spin_unlock(&mmu_context_lock); - - mm->context.id = NO_CONTEXT; -} diff --git a/arch/powerpc/mm/mmu_context_hash32.c b/arch/powerpc/mm/mmu_context_hash32.c new file mode 100644 index 00000000000..78fef6726e1 --- /dev/null +++ b/arch/powerpc/mm/mmu_context_hash32.c @@ -0,0 +1,119 @@ +/* + * This file contains the routines for handling the MMU on those + * PowerPC implementations where the MMU substantially follows the + * architecture specification. This includes the 6xx, 7xx, 7xxx, + * 8260, and POWER3 implementations but excludes the 8xx and 4xx. + * -- paulus + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/export.h> + +#include <asm/mmu_context.h> +#include <asm/tlbflush.h> + +/* + * On 32-bit PowerPC 6xx/7xx/7xxx CPUs, we use a set of 16 VSIDs + * (virtual segment identifiers) for each context. Although the + * hardware supports 24-bit VSIDs, and thus >1 million contexts, + * we only use 32,768 of them. That is ample, since there can be + * at most around 30,000 tasks in the system anyway, and it means + * that we can use a bitmap to indicate which contexts are in use. + * Using a bitmap means that we entirely avoid all of the problems + * that we used to have when the context number overflowed, + * particularly on SMP systems. + * -- paulus. + */ +#define NO_CONTEXT ((unsigned long) -1) +#define LAST_CONTEXT 32767 +#define FIRST_CONTEXT 1 + +/* + * This function defines the mapping from contexts to VSIDs (virtual + * segment IDs). We use a skew on both the context and the high 4 bits + * of the 32-bit virtual address (the "effective segment ID") in order + * to spread out the entries in the MMU hash table. Note, if this + * function is changed then arch/ppc/mm/hashtable.S will have to be + * changed to correspond. + * + * + * CTX_TO_VSID(ctx, va) (((ctx) * (897 * 16) + ((va) >> 28) * 0x111) \ + * & 0xffffff) + */ + +static unsigned long next_mmu_context; +static unsigned long context_map[LAST_CONTEXT / BITS_PER_LONG + 1]; + +unsigned long __init_new_context(void) +{ + unsigned long ctx = next_mmu_context; + + while (test_and_set_bit(ctx, context_map)) { + ctx = find_next_zero_bit(context_map, LAST_CONTEXT+1, ctx); + if (ctx > LAST_CONTEXT) + ctx = 0; + } + next_mmu_context = (ctx + 1) & LAST_CONTEXT; + + return ctx; +} +EXPORT_SYMBOL_GPL(__init_new_context); + +/* + * Set up the context for a new address space. + */ +int init_new_context(struct task_struct *t, struct mm_struct *mm) +{ + mm->context.id = __init_new_context(); + + return 0; +} + +/* + * Free a context ID. Make sure to call this with preempt disabled! + */ +void __destroy_context(unsigned long ctx) +{ + clear_bit(ctx, context_map); +} +EXPORT_SYMBOL_GPL(__destroy_context); + +/* + * We're finished using the context for an address space. + */ +void destroy_context(struct mm_struct *mm) +{ + preempt_disable(); + if (mm->context.id != NO_CONTEXT) { + __destroy_context(mm->context.id); + mm->context.id = NO_CONTEXT; + } + preempt_enable(); +} + +/* + * Initialize the context management stuff. + */ +void __init mmu_context_init(void) +{ + /* Reserve context 0 for kernel use */ + context_map[0] = (1 << FIRST_CONTEXT) - 1; + next_mmu_context = FIRST_CONTEXT; +} diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c new file mode 100644 index 00000000000..178876aef40 --- /dev/null +++ b/arch/powerpc/mm/mmu_context_hash64.c @@ -0,0 +1,146 @@ +/* + * MMU context allocation for 64-bit kernels. + * + * Copyright (C) 2004 Anton Blanchard, IBM Corp. <anton@samba.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/spinlock.h> +#include <linux/idr.h> +#include <linux/export.h> +#include <linux/gfp.h> +#include <linux/slab.h> + +#include <asm/mmu_context.h> +#include <asm/pgalloc.h> + +#include "icswx.h" + +static DEFINE_SPINLOCK(mmu_context_lock); +static DEFINE_IDA(mmu_context_ida); + +int __init_new_context(void) +{ + int index; + int err; + +again: + if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL)) + return -ENOMEM; + + spin_lock(&mmu_context_lock); + err = ida_get_new_above(&mmu_context_ida, 1, &index); + spin_unlock(&mmu_context_lock); + + if (err == -EAGAIN) + goto again; + else if (err) + return err; + + if (index > MAX_USER_CONTEXT) { + spin_lock(&mmu_context_lock); + ida_remove(&mmu_context_ida, index); + spin_unlock(&mmu_context_lock); + return -ENOMEM; + } + + return index; +} +EXPORT_SYMBOL_GPL(__init_new_context); + +int init_new_context(struct task_struct *tsk, struct mm_struct *mm) +{ + int index; + + index = __init_new_context(); + if (index < 0) + return index; + + /* The old code would re-promote on fork, we don't do that + * when using slices as it could cause problem promoting slices + * that have been forced down to 4K + */ + if (slice_mm_new_context(mm)) + slice_set_user_psize(mm, mmu_virtual_psize); + subpage_prot_init_new_context(mm); + mm->context.id = index; +#ifdef CONFIG_PPC_ICSWX + mm->context.cop_lockp = kmalloc(sizeof(spinlock_t), GFP_KERNEL); + if (!mm->context.cop_lockp) { + __destroy_context(index); + subpage_prot_free(mm); + mm->context.id = MMU_NO_CONTEXT; + return -ENOMEM; + } + spin_lock_init(mm->context.cop_lockp); +#endif /* CONFIG_PPC_ICSWX */ + +#ifdef CONFIG_PPC_64K_PAGES + mm->context.pte_frag = NULL; +#endif + return 0; +} + +void __destroy_context(int context_id) +{ + spin_lock(&mmu_context_lock); + ida_remove(&mmu_context_ida, context_id); + spin_unlock(&mmu_context_lock); +} +EXPORT_SYMBOL_GPL(__destroy_context); + +#ifdef CONFIG_PPC_64K_PAGES +static void destroy_pagetable_page(struct mm_struct *mm) +{ + int count; + void *pte_frag; + struct page *page; + + pte_frag = mm->context.pte_frag; + if (!pte_frag) + return; + + page = virt_to_page(pte_frag); + /* drop all the pending references */ + count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT; + /* We allow PTE_FRAG_NR fragments from a PTE page */ + count = atomic_sub_return(PTE_FRAG_NR - count, &page->_count); + if (!count) { + pgtable_page_dtor(page); + free_hot_cold_page(page, 0); + } +} + +#else +static inline void destroy_pagetable_page(struct mm_struct *mm) +{ + return; +} +#endif + + +void destroy_context(struct mm_struct *mm) +{ + +#ifdef CONFIG_PPC_ICSWX + drop_cop(mm->context.acop, mm); + kfree(mm->context.cop_lockp); + mm->context.cop_lockp = NULL; +#endif /* CONFIG_PPC_ICSWX */ + + destroy_pagetable_page(mm); + __destroy_context(mm->context.id); + subpage_prot_free(mm); + mm->context.id = MMU_NO_CONTEXT; +} diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c new file mode 100644 index 00000000000..928ebe79668 --- /dev/null +++ b/arch/powerpc/mm/mmu_context_nohash.c @@ -0,0 +1,449 @@ +/* + * This file contains the routines for handling the MMU on those + * PowerPC implementations where the MMU is not using the hash + * table, such as 8xx, 4xx, BookE's etc... + * + * Copyright 2008 Ben Herrenschmidt <benh@kernel.crashing.org> + * IBM Corp. + * + * Derived from previous arch/powerpc/mm/mmu_context.c + * and arch/powerpc/include/asm/mmu_context.h + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * TODO: + * + * - The global context lock will not scale very well + * - The maps should be dynamically allocated to allow for processors + * that support more PID bits at runtime + * - Implement flush_tlb_mm() by making the context stale and picking + * a new one + * - More aggressively clear stale map bits and maybe find some way to + * also clear mm->cpu_vm_mask bits when processes are migrated + */ + +//#define DEBUG_MAP_CONSISTENCY +//#define DEBUG_CLAMP_LAST_CONTEXT 31 +//#define DEBUG_HARDER + +/* We don't use DEBUG because it tends to be compiled in always nowadays + * and this would generate way too much output + */ +#ifdef DEBUG_HARDER +#define pr_hard(args...) printk(KERN_DEBUG args) +#define pr_hardcont(args...) printk(KERN_CONT args) +#else +#define pr_hard(args...) do { } while(0) +#define pr_hardcont(args...) do { } while(0) +#endif + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/spinlock.h> +#include <linux/bootmem.h> +#include <linux/notifier.h> +#include <linux/cpu.h> +#include <linux/slab.h> + +#include <asm/mmu_context.h> +#include <asm/tlbflush.h> + +static unsigned int first_context, last_context; +static unsigned int next_context, nr_free_contexts; +static unsigned long *context_map; +static unsigned long *stale_map[NR_CPUS]; +static struct mm_struct **context_mm; +static DEFINE_RAW_SPINLOCK(context_lock); + +#define CTX_MAP_SIZE \ + (sizeof(unsigned long) * (last_context / BITS_PER_LONG + 1)) + + +/* Steal a context from a task that has one at the moment. + * + * This is used when we are running out of available PID numbers + * on the processors. + * + * This isn't an LRU system, it just frees up each context in + * turn (sort-of pseudo-random replacement :). This would be the + * place to implement an LRU scheme if anyone was motivated to do it. + * -- paulus + * + * For context stealing, we use a slightly different approach for + * SMP and UP. Basically, the UP one is simpler and doesn't use + * the stale map as we can just flush the local CPU + * -- benh + */ +#ifdef CONFIG_SMP +static unsigned int steal_context_smp(unsigned int id) +{ + struct mm_struct *mm; + unsigned int cpu, max, i; + + max = last_context - first_context; + + /* Attempt to free next_context first and then loop until we manage */ + while (max--) { + /* Pick up the victim mm */ + mm = context_mm[id]; + + /* We have a candidate victim, check if it's active, on SMP + * we cannot steal active contexts + */ + if (mm->context.active) { + id++; + if (id > last_context) + id = first_context; + continue; + } + pr_hardcont(" | steal %d from 0x%p", id, mm); + + /* Mark this mm has having no context anymore */ + mm->context.id = MMU_NO_CONTEXT; + + /* Mark it stale on all CPUs that used this mm. For threaded + * implementations, we set it on all threads on each core + * represented in the mask. A future implementation will use + * a core map instead but this will do for now. + */ + for_each_cpu(cpu, mm_cpumask(mm)) { + for (i = cpu_first_thread_sibling(cpu); + i <= cpu_last_thread_sibling(cpu); i++) { + if (stale_map[i]) + __set_bit(id, stale_map[i]); + } + cpu = i - 1; + } + return id; + } + + /* This will happen if you have more CPUs than available contexts, + * all we can do here is wait a bit and try again + */ + raw_spin_unlock(&context_lock); + cpu_relax(); + raw_spin_lock(&context_lock); + + /* This will cause the caller to try again */ + return MMU_NO_CONTEXT; +} +#endif /* CONFIG_SMP */ + +/* Note that this will also be called on SMP if all other CPUs are + * offlined, which means that it may be called for cpu != 0. For + * this to work, we somewhat assume that CPUs that are onlined + * come up with a fully clean TLB (or are cleaned when offlined) + */ +static unsigned int steal_context_up(unsigned int id) +{ + struct mm_struct *mm; + int cpu = smp_processor_id(); + + /* Pick up the victim mm */ + mm = context_mm[id]; + + pr_hardcont(" | steal %d from 0x%p", id, mm); + + /* Flush the TLB for that context */ + local_flush_tlb_mm(mm); + + /* Mark this mm has having no context anymore */ + mm->context.id = MMU_NO_CONTEXT; + + /* XXX This clear should ultimately be part of local_flush_tlb_mm */ + __clear_bit(id, stale_map[cpu]); + + return id; +} + +#ifdef DEBUG_MAP_CONSISTENCY +static void context_check_map(void) +{ + unsigned int id, nrf, nact; + + nrf = nact = 0; + for (id = first_context; id <= last_context; id++) { + int used = test_bit(id, context_map); + if (!used) + nrf++; + if (used != (context_mm[id] != NULL)) + pr_err("MMU: Context %d is %s and MM is %p !\n", + id, used ? "used" : "free", context_mm[id]); + if (context_mm[id] != NULL) + nact += context_mm[id]->context.active; + } + if (nrf != nr_free_contexts) { + pr_err("MMU: Free context count out of sync ! (%d vs %d)\n", + nr_free_contexts, nrf); + nr_free_contexts = nrf; + } + if (nact > num_online_cpus()) + pr_err("MMU: More active contexts than CPUs ! (%d vs %d)\n", + nact, num_online_cpus()); + if (first_context > 0 && !test_bit(0, context_map)) + pr_err("MMU: Context 0 has been freed !!!\n"); +} +#else +static void context_check_map(void) { } +#endif + +void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) +{ + unsigned int i, id, cpu = smp_processor_id(); + unsigned long *map; + + /* No lockless fast path .. yet */ + raw_spin_lock(&context_lock); + + pr_hard("[%d] activating context for mm @%p, active=%d, id=%d", + cpu, next, next->context.active, next->context.id); + +#ifdef CONFIG_SMP + /* Mark us active and the previous one not anymore */ + next->context.active++; + if (prev) { + pr_hardcont(" (old=0x%p a=%d)", prev, prev->context.active); + WARN_ON(prev->context.active < 1); + prev->context.active--; + } + + again: +#endif /* CONFIG_SMP */ + + /* If we already have a valid assigned context, skip all that */ + id = next->context.id; + if (likely(id != MMU_NO_CONTEXT)) { +#ifdef DEBUG_MAP_CONSISTENCY + if (context_mm[id] != next) + pr_err("MMU: mm 0x%p has id %d but context_mm[%d] says 0x%p\n", + next, id, id, context_mm[id]); +#endif + goto ctxt_ok; + } + + /* We really don't have a context, let's try to acquire one */ + id = next_context; + if (id > last_context) + id = first_context; + map = context_map; + + /* No more free contexts, let's try to steal one */ + if (nr_free_contexts == 0) { +#ifdef CONFIG_SMP + if (num_online_cpus() > 1) { + id = steal_context_smp(id); + if (id == MMU_NO_CONTEXT) + goto again; + goto stolen; + } +#endif /* CONFIG_SMP */ + id = steal_context_up(id); + goto stolen; + } + nr_free_contexts--; + + /* We know there's at least one free context, try to find it */ + while (__test_and_set_bit(id, map)) { + id = find_next_zero_bit(map, last_context+1, id); + if (id > last_context) + id = first_context; + } + stolen: + next_context = id + 1; + context_mm[id] = next; + next->context.id = id; + pr_hardcont(" | new id=%d,nrf=%d", id, nr_free_contexts); + + context_check_map(); + ctxt_ok: + + /* If that context got marked stale on this CPU, then flush the + * local TLB for it and unmark it before we use it + */ + if (test_bit(id, stale_map[cpu])) { + pr_hardcont(" | stale flush %d [%d..%d]", + id, cpu_first_thread_sibling(cpu), + cpu_last_thread_sibling(cpu)); + + local_flush_tlb_mm(next); + + /* XXX This clear should ultimately be part of local_flush_tlb_mm */ + for (i = cpu_first_thread_sibling(cpu); + i <= cpu_last_thread_sibling(cpu); i++) { + if (stale_map[i]) + __clear_bit(id, stale_map[i]); + } + } + + /* Flick the MMU and release lock */ + pr_hardcont(" -> %d\n", id); + set_context(id, next->pgd); + raw_spin_unlock(&context_lock); +} + +/* + * Set up the context for a new address space. + */ +int init_new_context(struct task_struct *t, struct mm_struct *mm) +{ + pr_hard("initing context for mm @%p\n", mm); + + mm->context.id = MMU_NO_CONTEXT; + mm->context.active = 0; + +#ifdef CONFIG_PPC_MM_SLICES + if (slice_mm_new_context(mm)) + slice_set_user_psize(mm, mmu_virtual_psize); +#endif + + return 0; +} + +/* + * We're finished using the context for an address space. + */ +void destroy_context(struct mm_struct *mm) +{ + unsigned long flags; + unsigned int id; + + if (mm->context.id == MMU_NO_CONTEXT) + return; + + WARN_ON(mm->context.active != 0); + + raw_spin_lock_irqsave(&context_lock, flags); + id = mm->context.id; + if (id != MMU_NO_CONTEXT) { + __clear_bit(id, context_map); + mm->context.id = MMU_NO_CONTEXT; +#ifdef DEBUG_MAP_CONSISTENCY + mm->context.active = 0; +#endif + context_mm[id] = NULL; + nr_free_contexts++; + } + raw_spin_unlock_irqrestore(&context_lock, flags); +} + +#ifdef CONFIG_SMP + +static int mmu_context_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned int)(long)hcpu; + + /* We don't touch CPU 0 map, it's allocated at aboot and kept + * around forever + */ + if (cpu == boot_cpuid) + return NOTIFY_OK; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + pr_devel("MMU: Allocating stale context map for CPU %d\n", cpu); + stale_map[cpu] = kzalloc(CTX_MAP_SIZE, GFP_KERNEL); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + pr_devel("MMU: Freeing stale context map for CPU %d\n", cpu); + kfree(stale_map[cpu]); + stale_map[cpu] = NULL; + + /* We also clear the cpu_vm_mask bits of CPUs going away */ + clear_tasks_mm_cpumask(cpu); + break; +#endif /* CONFIG_HOTPLUG_CPU */ + } + return NOTIFY_OK; +} + +static struct notifier_block mmu_context_cpu_nb = { + .notifier_call = mmu_context_cpu_notify, +}; + +#endif /* CONFIG_SMP */ + +/* + * Initialize the context management stuff. + */ +void __init mmu_context_init(void) +{ + /* Mark init_mm as being active on all possible CPUs since + * we'll get called with prev == init_mm the first time + * we schedule on a given CPU + */ + init_mm.context.active = NR_CPUS; + + /* + * The MPC8xx has only 16 contexts. We rotate through them on each + * task switch. A better way would be to keep track of tasks that + * own contexts, and implement an LRU usage. That way very active + * tasks don't always have to pay the TLB reload overhead. The + * kernel pages are mapped shared, so the kernel can run on behalf + * of any task that makes a kernel entry. Shared does not mean they + * are not protected, just that the ASID comparison is not performed. + * -- Dan + * + * The IBM4xx has 256 contexts, so we can just rotate through these + * as a way of "switching" contexts. If the TID of the TLB is zero, + * the PID/TID comparison is disabled, so we can use a TID of zero + * to represent all kernel pages as shared among all contexts. + * -- Dan + * + * The IBM 47x core supports 16-bit PIDs, thus 65535 contexts. We + * should normally never have to steal though the facility is + * present if needed. + * -- BenH + */ + if (mmu_has_feature(MMU_FTR_TYPE_8xx)) { + first_context = 0; + last_context = 15; + } else if (mmu_has_feature(MMU_FTR_TYPE_47x)) { + first_context = 1; + last_context = 65535; + } else { + first_context = 1; + last_context = 255; + } + +#ifdef DEBUG_CLAMP_LAST_CONTEXT + last_context = DEBUG_CLAMP_LAST_CONTEXT; +#endif + /* + * Allocate the maps used by context management + */ + context_map = alloc_bootmem(CTX_MAP_SIZE); + context_mm = alloc_bootmem(sizeof(void *) * (last_context + 1)); +#ifndef CONFIG_SMP + stale_map[0] = alloc_bootmem(CTX_MAP_SIZE); +#else + stale_map[boot_cpuid] = alloc_bootmem(CTX_MAP_SIZE); + + register_cpu_notifier(&mmu_context_cpu_nb); +#endif + + printk(KERN_INFO + "MMU: Allocated %zu bytes of context maps for %d contexts\n", + 2 * CTX_MAP_SIZE + (sizeof(void *) * (last_context + 1)), + last_context - first_context + 1); + + /* + * Some processors have too few contexts to reserve one for + * init_mm, and require using context 0 for a normal task. + * Other processors reserve the use of context zero for the kernel. + * This code assumes first_context < 32. + */ + context_map[0] = (1 << first_context) - 1; + next_context = first_context; + nr_free_contexts = last_context - first_context + 1; +} + diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index ebfd13dc9d1..9615d82919b 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -22,20 +22,84 @@ #include <asm/tlbflush.h> #include <asm/mmu.h> +#ifdef CONFIG_PPC_MMU_NOHASH + +/* + * On 40x and 8xx, we directly inline tlbia and tlbivax + */ +#if defined(CONFIG_40x) || defined(CONFIG_8xx) +static inline void _tlbil_all(void) +{ + asm volatile ("sync; tlbia; isync" : : : "memory"); +} +static inline void _tlbil_pid(unsigned int pid) +{ + asm volatile ("sync; tlbia; isync" : : : "memory"); +} +#define _tlbil_pid_noind(pid) _tlbil_pid(pid) + +#else /* CONFIG_40x || CONFIG_8xx */ +extern void _tlbil_all(void); +extern void _tlbil_pid(unsigned int pid); +#ifdef CONFIG_PPC_BOOK3E +extern void _tlbil_pid_noind(unsigned int pid); +#else +#define _tlbil_pid_noind(pid) _tlbil_pid(pid) +#endif +#endif /* !(CONFIG_40x || CONFIG_8xx) */ + +/* + * On 8xx, we directly inline tlbie, on others, it's extern + */ +#ifdef CONFIG_8xx +static inline void _tlbil_va(unsigned long address, unsigned int pid, + unsigned int tsize, unsigned int ind) +{ + asm volatile ("tlbie %0; sync" : : "r" (address) : "memory"); +} +#elif defined(CONFIG_PPC_BOOK3E) +extern void _tlbil_va(unsigned long address, unsigned int pid, + unsigned int tsize, unsigned int ind); +#else +extern void __tlbil_va(unsigned long address, unsigned int pid); +static inline void _tlbil_va(unsigned long address, unsigned int pid, + unsigned int tsize, unsigned int ind) +{ + __tlbil_va(address, pid); +} +#endif /* CONIFG_8xx */ + +#if defined(CONFIG_PPC_BOOK3E) || defined(CONFIG_PPC_47x) +extern void _tlbivax_bcast(unsigned long address, unsigned int pid, + unsigned int tsize, unsigned int ind); +#else +static inline void _tlbivax_bcast(unsigned long address, unsigned int pid, + unsigned int tsize, unsigned int ind) +{ + BUG(); +} +#endif + +#else /* CONFIG_PPC_MMU_NOHASH */ + extern void hash_preload(struct mm_struct *mm, unsigned long ea, unsigned long access, unsigned long trap); +extern void _tlbie(unsigned long address); +extern void _tlbia(void); + +#endif /* CONFIG_PPC_MMU_NOHASH */ + #ifdef CONFIG_PPC32 + extern void mapin_ram(void); extern int map_page(unsigned long va, phys_addr_t pa, int flags); -extern void setbat(int index, unsigned long virt, unsigned long phys, +extern void setbat(int index, unsigned long virt, phys_addr_t phys, unsigned int size, int flags); -extern void settlbcam(int index, unsigned long virt, phys_addr_t phys, - unsigned int size, int flags, unsigned int pid); -extern void invalidate_tlbcam_entry(int index); extern int __map_without_bats; +extern int __allow_ioremap_reserved; extern unsigned long ioremap_base; extern unsigned int rtas_data, rtas_size; @@ -43,50 +107,61 @@ struct hash_pte; extern struct hash_pte *Hash, *Hash_end; extern unsigned long Hash_size, Hash_mask; -extern unsigned int num_tlbcam_entries; -#endif +#endif /* CONFIG_PPC32 */ + +#ifdef CONFIG_PPC64 +extern int map_kernel_page(unsigned long ea, unsigned long pa, int flags); +#endif /* CONFIG_PPC64 */ extern unsigned long ioremap_bot; extern unsigned long __max_low_memory; -extern unsigned long __initial_memory_limit; -extern unsigned long total_memory; -extern unsigned long total_lowmem; +extern phys_addr_t __initial_memory_limit_addr; +extern phys_addr_t total_memory; +extern phys_addr_t total_lowmem; +extern phys_addr_t memstart_addr; +extern phys_addr_t lowmem_end_addr; + +#ifdef CONFIG_WII +extern unsigned long wii_hole_start; +extern unsigned long wii_hole_size; + +extern unsigned long wii_mmu_mapin_mem2(unsigned long top); +extern void wii_memory_fixups(void); +#endif /* ...and now those things that may be slightly different between processor * architectures. -- Dan */ #if defined(CONFIG_8xx) -#define flush_HPTE(X, va, pg) _tlbie(va, 0 /* 8xx doesn't care about PID */) #define MMU_init_hw() do { } while(0) -#define mmu_mapin_ram() (0UL) +#define mmu_mapin_ram(top) (0UL) #elif defined(CONFIG_4xx) -#define flush_HPTE(pid, va, pg) _tlbie(va, pid) extern void MMU_init_hw(void); -extern unsigned long mmu_mapin_ram(void); +extern unsigned long mmu_mapin_ram(unsigned long top); -#elif defined(CONFIG_FSL_BOOKE) -#define flush_HPTE(pid, va, pg) _tlbie(va, pid) +#elif defined(CONFIG_PPC_FSL_BOOK3E) +extern unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx); +extern unsigned long calc_cam_sz(unsigned long ram, unsigned long virt, + phys_addr_t phys); +#ifdef CONFIG_PPC32 extern void MMU_init_hw(void); -extern unsigned long mmu_mapin_ram(void); +extern unsigned long mmu_mapin_ram(unsigned long top); extern void adjust_total_lowmem(void); +extern int switch_to_as1(void); +extern void restore_to_as0(int esel, int offset, void *dt_ptr, int bootcpu); +#endif +extern void loadcam_entry(unsigned int index); +struct tlbcam { + u32 MAS0; + u32 MAS1; + unsigned long MAS2; + u32 MAS3; + u32 MAS7; +}; #elif defined(CONFIG_PPC32) /* anything 32-bit except 4xx or 8xx */ extern void MMU_init_hw(void); -extern unsigned long mmu_mapin_ram(void); - -/* Be careful....this needs to be updated if we ever encounter 603 SMPs, - * which includes all new 82xx processors. We need tlbie/tlbsync here - * in that case (I think). -- Dan. - */ -static inline void flush_HPTE(unsigned context, unsigned long va, - unsigned long pdval) -{ - if ((Hash != 0) && - cpu_has_feature(CPU_FTR_HPTE_TABLE)) - flush_hash_pages(0, va, pdval, 1); - else - _tlbie(va); -} +extern unsigned long mmu_mapin_ram(unsigned long top); #endif diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index a300d254aac..3b181b22cd4 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -13,14 +13,31 @@ #include <linux/init.h> #include <linux/mm.h> #include <linux/mmzone.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/nodemask.h> #include <linux/cpu.h> #include <linux/notifier.h> +#include <linux/memblock.h> +#include <linux/of.h> +#include <linux/pfn.h> +#include <linux/cpuset.h> +#include <linux/node.h> +#include <linux/stop_machine.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/uaccess.h> +#include <linux/slab.h> +#include <asm/cputhreads.h> #include <asm/sparsemem.h> -#include <asm/lmb.h> -#include <asm/system.h> +#include <asm/prom.h> #include <asm/smp.h> +#include <asm/cputhreads.h> +#include <asm/topology.h> +#include <asm/firmware.h> +#include <asm/paca.h> +#include <asm/hvcall.h> +#include <asm/setup.h> +#include <asm/vdso.h> static int numa_enabled = 1; @@ -30,18 +47,45 @@ static int numa_debug; #define dbg(args...) if (numa_debug) { printk(KERN_INFO args); } int numa_cpu_lookup_table[NR_CPUS]; -cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES]; +cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; struct pglist_data *node_data[MAX_NUMNODES]; EXPORT_SYMBOL(numa_cpu_lookup_table); -EXPORT_SYMBOL(numa_cpumask_lookup_table); +EXPORT_SYMBOL(node_to_cpumask_map); EXPORT_SYMBOL(node_data); -static bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES]; static int min_common_depth; static int n_mem_addr_cells, n_mem_size_cells; +static int form1_affinity; -static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn, +#define MAX_DISTANCE_REF_POINTS 4 +static int distance_ref_points_depth; +static const __be32 *distance_ref_points; +static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS]; + +/* + * Allocate node_to_cpumask_map based on number of available nodes + * Requires node_possible_map to be valid. + * + * Note: cpumask_of_node() is not valid until after this is done. + */ +static void __init setup_node_to_cpumask_map(void) +{ + unsigned int node; + + /* setup nr_node_ids if not done yet */ + if (nr_node_ids == MAX_NUMNODES) + setup_nr_node_ids(); + + /* allocate the map */ + for (node = 0; node < nr_node_ids; node++) + alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); + + /* cpumask_of_node() will now work */ + dbg("Node to cpumask map for %d nodes\n", nr_node_ids); +} + +static int __init fake_numa_create_new_node(unsigned long end_pfn, unsigned int *nid) { unsigned long long mem; @@ -88,92 +132,163 @@ static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn, return 0; } -static void __cpuinit map_cpu_to_node(int cpu, int node) +/* + * get_node_active_region - Return active region containing pfn + * Active range returned is empty if none found. + * @pfn: The page to return the region for + * @node_ar: Returned set to the active region containing @pfn + */ +static void __init get_node_active_region(unsigned long pfn, + struct node_active_region *node_ar) +{ + unsigned long start_pfn, end_pfn; + int i, nid; + + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { + if (pfn >= start_pfn && pfn < end_pfn) { + node_ar->nid = nid; + node_ar->start_pfn = start_pfn; + node_ar->end_pfn = end_pfn; + break; + } + } +} + +static void reset_numa_cpu_lookup_table(void) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) + numa_cpu_lookup_table[cpu] = -1; +} + +static void update_numa_cpu_lookup_table(unsigned int cpu, int node) { numa_cpu_lookup_table[cpu] = node; +} + +static void map_cpu_to_node(int cpu, int node) +{ + update_numa_cpu_lookup_table(cpu, node); dbg("adding cpu %d to node %d\n", cpu, node); - if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node]))) - cpu_set(cpu, numa_cpumask_lookup_table[node]); + if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node]))) + cpumask_set_cpu(cpu, node_to_cpumask_map[node]); } -#ifdef CONFIG_HOTPLUG_CPU +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR) static void unmap_cpu_from_node(unsigned long cpu) { int node = numa_cpu_lookup_table[cpu]; dbg("removing cpu %lu from node %d\n", cpu, node); - if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) { - cpu_clear(cpu, numa_cpumask_lookup_table[node]); + if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) { + cpumask_clear_cpu(cpu, node_to_cpumask_map[node]); } else { printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n", cpu, node); } } -#endif /* CONFIG_HOTPLUG_CPU */ +#endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */ + +/* must hold reference to node during call */ +static const __be32 *of_get_associativity(struct device_node *dev) +{ + return of_get_property(dev, "ibm,associativity", NULL); +} -static struct device_node * __cpuinit find_cpu_node(unsigned int cpu) +/* + * Returns the property linux,drconf-usable-memory if + * it exists (the property exists only in kexec/kdump kernels, + * added by kexec-tools) + */ +static const __be32 *of_get_usable_memory(struct device_node *memory) { - unsigned int hw_cpuid = get_hard_smp_processor_id(cpu); - struct device_node *cpu_node = NULL; - const unsigned int *interrupt_server, *reg; - int len; + const __be32 *prop; + u32 len; + prop = of_get_property(memory, "linux,drconf-usable-memory", &len); + if (!prop || len < sizeof(unsigned int)) + return NULL; + return prop; +} - while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) { - /* Try interrupt server first */ - interrupt_server = of_get_property(cpu_node, - "ibm,ppc-interrupt-server#s", &len); +int __node_distance(int a, int b) +{ + int i; + int distance = LOCAL_DISTANCE; - len = len / sizeof(u32); + if (!form1_affinity) + return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE); - if (interrupt_server && (len > 0)) { - while (len--) { - if (interrupt_server[len] == hw_cpuid) - return cpu_node; - } - } else { - reg = of_get_property(cpu_node, "reg", &len); - if (reg && (len > 0) && (reg[0] == hw_cpuid)) - return cpu_node; - } + for (i = 0; i < distance_ref_points_depth; i++) { + if (distance_lookup_table[a][i] == distance_lookup_table[b][i]) + break; + + /* Double the distance for each NUMA level */ + distance *= 2; } - return NULL; + return distance; } +EXPORT_SYMBOL(__node_distance); -/* must hold reference to node during call */ -static const int *of_get_associativity(struct device_node *dev) +static void initialize_distance_lookup_table(int nid, + const __be32 *associativity) { - return of_get_property(dev, "ibm,associativity", NULL); + int i; + + if (!form1_affinity) + return; + + for (i = 0; i < distance_ref_points_depth; i++) { + const __be32 *entry; + + entry = &associativity[be32_to_cpu(distance_ref_points[i])]; + distance_lookup_table[nid][i] = of_read_number(entry, 1); + } } /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa * info is found. */ -static int of_node_to_nid_single(struct device_node *device) +static int associativity_to_nid(const __be32 *associativity) { int nid = -1; - const unsigned int *tmp; if (min_common_depth == -1) goto out; - tmp = of_get_associativity(device); - if (!tmp) - goto out; - - if (tmp[0] >= min_common_depth) - nid = tmp[min_common_depth]; + if (of_read_number(associativity, 1) >= min_common_depth) + nid = of_read_number(&associativity[min_common_depth], 1); /* POWER4 LPAR uses 0xffff as invalid node */ if (nid == 0xffff || nid >= MAX_NUMNODES) nid = -1; + + if (nid > 0 && + of_read_number(associativity, 1) >= distance_ref_points_depth) + initialize_distance_lookup_table(nid, associativity); + out: return nid; } +/* Returns the nid associated with the given device tree node, + * or -1 if not found. + */ +static int of_node_to_nid_single(struct device_node *device) +{ + int nid = -1; + const __be32 *tmp; + + tmp = of_get_associativity(device); + if (tmp) + nid = associativity_to_nid(tmp); + return nid; +} + /* Walk the device tree upwards, looking for an associativity id */ int of_node_to_nid(struct device_node *device) { @@ -196,50 +311,75 @@ int of_node_to_nid(struct device_node *device) } EXPORT_SYMBOL_GPL(of_node_to_nid); -/* - * In theory, the "ibm,associativity" property may contain multiple - * associativity lists because a resource may be multiply connected - * into the machine. This resource then has different associativity - * characteristics relative to its multiple connections. We ignore - * this for now. We also assume that all cpu and memory sets have - * their distances represented at a common level. This won't be - * true for hierarchical NUMA. - * - * In any case the ibm,associativity-reference-points should give - * the correct depth for a normal NUMA system. - * - * - Dave Hansen <haveblue@us.ibm.com> - */ static int __init find_min_common_depth(void) { int depth; - const unsigned int *ref_points; - struct device_node *rtas_root; - unsigned int len; - - rtas_root = of_find_node_by_path("/rtas"); + struct device_node *root; - if (!rtas_root) - return -1; + if (firmware_has_feature(FW_FEATURE_OPAL)) + root = of_find_node_by_path("/ibm,opal"); + else + root = of_find_node_by_path("/rtas"); + if (!root) + root = of_find_node_by_path("/"); /* - * this property is 2 32-bit integers, each representing a level of - * depth in the associativity nodes. The first is for an SMP - * configuration (should be all 0's) and the second is for a normal - * NUMA configuration. + * This property is a set of 32-bit integers, each representing + * an index into the ibm,associativity nodes. + * + * With form 0 affinity the first integer is for an SMP configuration + * (should be all 0's) and the second is for a normal NUMA + * configuration. We have only one level of NUMA. + * + * With form 1 affinity the first integer is the most significant + * NUMA boundary and the following are progressively less significant + * boundaries. There can be more than one level of NUMA. */ - ref_points = of_get_property(rtas_root, - "ibm,associativity-reference-points", &len); + distance_ref_points = of_get_property(root, + "ibm,associativity-reference-points", + &distance_ref_points_depth); - if ((len >= 1) && ref_points) { - depth = ref_points[1]; - } else { + if (!distance_ref_points) { dbg("NUMA: ibm,associativity-reference-points not found.\n"); - depth = -1; + goto err; + } + + distance_ref_points_depth /= sizeof(int); + + if (firmware_has_feature(FW_FEATURE_OPAL) || + firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) { + dbg("Using form 1 affinity\n"); + form1_affinity = 1; + } + + if (form1_affinity) { + depth = of_read_number(distance_ref_points, 1); + } else { + if (distance_ref_points_depth < 2) { + printk(KERN_WARNING "NUMA: " + "short ibm,associativity-reference-points\n"); + goto err; + } + + depth = of_read_number(&distance_ref_points[1], 1); } - of_node_put(rtas_root); + /* + * Warn and cap if the hardware supports more than + * MAX_DISTANCE_REF_POINTS domains. + */ + if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) { + printk(KERN_WARNING "NUMA: distance array capped at " + "%d entries\n", MAX_DISTANCE_REF_POINTS); + distance_ref_points_depth = MAX_DISTANCE_REF_POINTS; + } + + of_node_put(root); return depth; + +err: + of_node_put(root); + return -1; } static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells) @@ -255,35 +395,174 @@ static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells) of_node_put(memory); } -static unsigned long __devinit read_n_cells(int n, const unsigned int **buf) +static unsigned long read_n_cells(int n, const __be32 **buf) { unsigned long result = 0; while (n--) { - result = (result << 32) | **buf; + result = (result << 32) | of_read_number(*buf, 1); (*buf)++; } return result; } /* + * Read the next memblock list entry from the ibm,dynamic-memory property + * and return the information in the provided of_drconf_cell structure. + */ +static void read_drconf_cell(struct of_drconf_cell *drmem, const __be32 **cellp) +{ + const __be32 *cp; + + drmem->base_addr = read_n_cells(n_mem_addr_cells, cellp); + + cp = *cellp; + drmem->drc_index = of_read_number(cp, 1); + drmem->reserved = of_read_number(&cp[1], 1); + drmem->aa_index = of_read_number(&cp[2], 1); + drmem->flags = of_read_number(&cp[3], 1); + + *cellp = cp + 4; +} + +/* + * Retrieve and validate the ibm,dynamic-memory property of the device tree. + * + * The layout of the ibm,dynamic-memory property is a number N of memblock + * list entries followed by N memblock list entries. Each memblock list entry + * contains information as laid out in the of_drconf_cell struct above. + */ +static int of_get_drconf_memory(struct device_node *memory, const __be32 **dm) +{ + const __be32 *prop; + u32 len, entries; + + prop = of_get_property(memory, "ibm,dynamic-memory", &len); + if (!prop || len < sizeof(unsigned int)) + return 0; + + entries = of_read_number(prop++, 1); + + /* Now that we know the number of entries, revalidate the size + * of the property read in to ensure we have everything + */ + if (len < (entries * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int)) + return 0; + + *dm = prop; + return entries; +} + +/* + * Retrieve and validate the ibm,lmb-size property for drconf memory + * from the device tree. + */ +static u64 of_get_lmb_size(struct device_node *memory) +{ + const __be32 *prop; + u32 len; + + prop = of_get_property(memory, "ibm,lmb-size", &len); + if (!prop || len < sizeof(unsigned int)) + return 0; + + return read_n_cells(n_mem_size_cells, &prop); +} + +struct assoc_arrays { + u32 n_arrays; + u32 array_sz; + const __be32 *arrays; +}; + +/* + * Retrieve and validate the list of associativity arrays for drconf + * memory from the ibm,associativity-lookup-arrays property of the + * device tree.. + * + * The layout of the ibm,associativity-lookup-arrays property is a number N + * indicating the number of associativity arrays, followed by a number M + * indicating the size of each associativity array, followed by a list + * of N associativity arrays. + */ +static int of_get_assoc_arrays(struct device_node *memory, + struct assoc_arrays *aa) +{ + const __be32 *prop; + u32 len; + + prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len); + if (!prop || len < 2 * sizeof(unsigned int)) + return -1; + + aa->n_arrays = of_read_number(prop++, 1); + aa->array_sz = of_read_number(prop++, 1); + + /* Now that we know the number of arrays and size of each array, + * revalidate the size of the property read in. + */ + if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int)) + return -1; + + aa->arrays = prop; + return 0; +} + +/* + * This is like of_node_to_nid_single() for memory represented in the + * ibm,dynamic-reconfiguration-memory node. + */ +static int of_drconf_to_nid_single(struct of_drconf_cell *drmem, + struct assoc_arrays *aa) +{ + int default_nid = 0; + int nid = default_nid; + int index; + + if (min_common_depth > 0 && min_common_depth <= aa->array_sz && + !(drmem->flags & DRCONF_MEM_AI_INVALID) && + drmem->aa_index < aa->n_arrays) { + index = drmem->aa_index * aa->array_sz + min_common_depth - 1; + nid = of_read_number(&aa->arrays[index], 1); + + if (nid == 0xffff || nid >= MAX_NUMNODES) + nid = default_nid; + } + + return nid; +} + +/* * Figure out to which domain a cpu belongs and stick it there. * Return the id of the domain used. */ -static int __cpuinit numa_setup_cpu(unsigned long lcpu) +static int numa_setup_cpu(unsigned long lcpu) { - int nid = 0; - struct device_node *cpu = find_cpu_node(lcpu); + int nid; + struct device_node *cpu; + + /* + * If a valid cpu-to-node mapping is already available, use it + * directly instead of querying the firmware, since it represents + * the most recent mapping notified to us by the platform (eg: VPHN). + */ + if ((nid = numa_cpu_lookup_table[lcpu]) >= 0) { + map_cpu_to_node(lcpu, nid); + return nid; + } + + cpu = of_get_cpu_node(lcpu, NULL); if (!cpu) { WARN_ON(1); + nid = 0; goto out; } nid = of_node_to_nid_single(cpu); if (nid < 0 || !node_online(nid)) - nid = any_online_node(NODE_MASK_ALL); + nid = first_online_node; out: map_cpu_to_node(lcpu, nid); @@ -292,17 +571,38 @@ out: return nid; } -static int __cpuinit cpu_numa_callback(struct notifier_block *nfb, - unsigned long action, +static void verify_cpu_node_mapping(int cpu, int node) +{ + int base, sibling, i; + + /* Verify that all the threads in the core belong to the same node */ + base = cpu_first_thread_sibling(cpu); + + for (i = 0; i < threads_per_core; i++) { + sibling = base + i; + + if (sibling == cpu || cpu_is_offline(sibling)) + continue; + + if (cpu_to_node(sibling) != node) { + WARN(1, "CPU thread siblings %d and %d don't belong" + " to the same node!\n", cpu, sibling); + break; + } + } +} + +static int cpu_numa_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned long lcpu = (unsigned long)hcpu; - int ret = NOTIFY_DONE; + int ret = NOTIFY_DONE, nid; switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - numa_setup_cpu(lcpu); + nid = numa_setup_cpu(lcpu); + verify_cpu_node_mapping((int)lcpu, nid); ret = NOTIFY_OK; break; #ifdef CONFIG_HOTPLUG_CPU @@ -324,27 +624,40 @@ static int __cpuinit cpu_numa_callback(struct notifier_block *nfb, * Returns the size the region should have to enforce the memory limit. * This will either be the original value of size, a truncated value, * or zero. If the returned value of size is 0 the region should be - * discarded as it lies wholy above the memory limit. + * discarded as it lies wholly above the memory limit. */ static unsigned long __init numa_enforce_memory_limit(unsigned long start, unsigned long size) { /* - * We use lmb_end_of_DRAM() in here instead of memory_limit because + * We use memblock_end_of_DRAM() in here instead of memory_limit because * we've already adjusted it for the limit and it takes care of - * having memory holes below the limit. + * having memory holes below the limit. Also, in the case of + * iommu_is_off, memory_limit is not set but is implicitly enforced. */ - if (! memory_limit) - return size; - - if (start + size <= lmb_end_of_DRAM()) + if (start + size <= memblock_end_of_DRAM()) return size; - if (start >= lmb_end_of_DRAM()) + if (start >= memblock_end_of_DRAM()) return 0; - return lmb_end_of_DRAM() - start; + return memblock_end_of_DRAM() - start; +} + +/* + * Reads the counter for a given entry in + * linux,drconf-usable-memory property + */ +static inline int __init read_usm_ranges(const __be32 **usm) +{ + /* + * For each lmb in ibm,dynamic-memory a corresponding + * entry in linux,drconf-usable-memory property contains + * a counter followed by that many (base, size) duple. + * read the counter from linux,drconf-usable-memory + */ + return read_n_cells(n_mem_size_cells, usm); } /* @@ -353,64 +666,70 @@ static unsigned long __init numa_enforce_memory_limit(unsigned long start, */ static void __init parse_drconf_memory(struct device_node *memory) { - const unsigned int *lm, *dm, *aa; - unsigned int ls, ld, la; - unsigned int n, aam, aalen; - unsigned long lmb_size, size, start; - int nid, default_nid = 0; - unsigned int ai, flags; - - lm = of_get_property(memory, "ibm,lmb-size", &ls); - dm = of_get_property(memory, "ibm,dynamic-memory", &ld); - aa = of_get_property(memory, "ibm,associativity-lookup-arrays", &la); - if (!lm || !dm || !aa || - ls < sizeof(unsigned int) || ld < sizeof(unsigned int) || - la < 2 * sizeof(unsigned int)) + const __be32 *uninitialized_var(dm), *usm; + unsigned int n, rc, ranges, is_kexec_kdump = 0; + unsigned long lmb_size, base, size, sz; + int nid; + struct assoc_arrays aa = { .arrays = NULL }; + + n = of_get_drconf_memory(memory, &dm); + if (!n) + return; + + lmb_size = of_get_lmb_size(memory); + if (!lmb_size) return; - lmb_size = read_n_cells(n_mem_size_cells, &lm); - n = *dm++; /* number of LMBs */ - aam = *aa++; /* number of associativity lists */ - aalen = *aa++; /* length of each associativity list */ - if (ld < (n * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int) || - la < (aam * aalen + 2) * sizeof(unsigned int)) + rc = of_get_assoc_arrays(memory, &aa); + if (rc) return; + /* check if this is a kexec/kdump kernel */ + usm = of_get_usable_memory(memory); + if (usm != NULL) + is_kexec_kdump = 1; + for (; n != 0; --n) { - start = read_n_cells(n_mem_addr_cells, &dm); - ai = dm[2]; - flags = dm[3]; - dm += 4; - /* 0x80 == reserved, 0x8 = assigned to us */ - if ((flags & 0x80) || !(flags & 0x8)) - continue; - nid = default_nid; - /* flags & 0x40 means associativity index is invalid */ - if (min_common_depth > 0 && min_common_depth <= aalen && - (flags & 0x40) == 0 && ai < aam) { - /* this is like of_node_to_nid_single */ - nid = aa[ai * aalen + min_common_depth - 1]; - if (nid == 0xffff || nid >= MAX_NUMNODES) - nid = default_nid; - } + struct of_drconf_cell drmem; - fake_numa_create_new_node(((start + lmb_size) >> PAGE_SHIFT), - &nid); - node_set_online(nid); + read_drconf_cell(&drmem, &dm); - size = numa_enforce_memory_limit(start, lmb_size); - if (!size) + /* skip this block if the reserved bit is set in flags (0x80) + or if the block is not assigned to this partition (0x8) */ + if ((drmem.flags & DRCONF_MEM_RESERVED) + || !(drmem.flags & DRCONF_MEM_ASSIGNED)) continue; - add_active_range(nid, start >> PAGE_SHIFT, - (start >> PAGE_SHIFT) + (size >> PAGE_SHIFT)); + base = drmem.base_addr; + size = lmb_size; + ranges = 1; + + if (is_kexec_kdump) { + ranges = read_usm_ranges(&usm); + if (!ranges) /* there are no (base, size) duple */ + continue; + } + do { + if (is_kexec_kdump) { + base = read_n_cells(n_mem_addr_cells, &usm); + size = read_n_cells(n_mem_size_cells, &usm); + } + nid = of_drconf_to_nid_single(&drmem, &aa); + fake_numa_create_new_node( + ((base + size) >> PAGE_SHIFT), + &nid); + node_set_online(nid); + sz = numa_enforce_memory_limit(base, size); + if (sz) + memblock_set_node(base, sz, + &memblock.memory, nid); + } while (--ranges); } } static int __init parse_numa_properties(void) { - struct device_node *cpu = NULL; - struct device_node *memory = NULL; + struct device_node *memory; int default_nid = 0; unsigned long i; @@ -432,9 +751,10 @@ static int __init parse_numa_properties(void) * each node to be onlined must have NODE_DATA etc backing it. */ for_each_present_cpu(i) { + struct device_node *cpu; int nid; - cpu = find_cpu_node(i); + cpu = of_get_cpu_node(i, NULL); BUG_ON(!cpu); nid = of_node_to_nid_single(cpu); of_node_put(cpu); @@ -450,13 +770,13 @@ static int __init parse_numa_properties(void) } get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells); - memory = NULL; - while ((memory = of_find_node_by_type(memory, "memory")) != NULL) { + + for_each_node_by_type(memory, "memory") { unsigned long start; unsigned long size; int nid; int ranges; - const unsigned int *memcell_buf; + const __be32 *memcell_buf; unsigned int len; memcell_buf = of_get_property(memory, @@ -492,16 +812,16 @@ new_range: continue; } - add_active_range(nid, start >> PAGE_SHIFT, - (start >> PAGE_SHIFT) + (size >> PAGE_SHIFT)); + memblock_set_node(start, size, &memblock.memory, nid); if (--ranges) goto new_range; } /* - * Now do the same thing for each LMB listed in the ibm,dynamic-memory - * property in the ibm,dynamic-reconfiguration-memory node. + * Now do the same thing for each MEMBLOCK listed in the + * ibm,dynamic-memory property in the + * ibm,dynamic-reconfiguration-memory node. */ memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); if (memory) @@ -512,22 +832,25 @@ new_range: static void __init setup_nonnuma(void) { - unsigned long top_of_ram = lmb_end_of_DRAM(); - unsigned long total_ram = lmb_phys_mem_size(); + unsigned long top_of_ram = memblock_end_of_DRAM(); + unsigned long total_ram = memblock_phys_mem_size(); unsigned long start_pfn, end_pfn; - unsigned int i, nid = 0; + unsigned int nid = 0; + struct memblock_region *reg; printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n", top_of_ram, total_ram); printk(KERN_DEBUG "Memory hole size: %ldMB\n", (top_of_ram - total_ram) >> 20); - for (i = 0; i < lmb.memory.cnt; ++i) { - start_pfn = lmb.memory.region[i].base >> PAGE_SHIFT; - end_pfn = start_pfn + lmb_size_pages(&lmb.memory, i); + for_each_memblock(memory, reg) { + start_pfn = memblock_region_memory_base_pfn(reg); + end_pfn = memblock_region_memory_end_pfn(reg); fake_numa_create_new_node(end_pfn, &nid); - add_active_range(nid, start_pfn, end_pfn); + memblock_set_node(PFN_PHYS(start_pfn), + PFN_PHYS(end_pfn - start_pfn), + &memblock.memory, nid); node_set_online(nid); } } @@ -548,8 +871,9 @@ void __init dump_numa_cpu_topology(void) * If we used a CPU iterator here we would miss printing * the holes in the cpumap. */ - for (cpu = 0; cpu < NR_CPUS; cpu++) { - if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) { + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { + if (cpumask_test_cpu(cpu, + node_to_cpumask_map[node])) { if (count == 0) printk(" %u", cpu); ++count; @@ -561,7 +885,7 @@ void __init dump_numa_cpu_topology(void) } if (count > 1) - printk("-%u", NR_CPUS - 1); + printk("-%u", nr_cpu_ids - 1); printk("\n"); } } @@ -581,7 +905,7 @@ static void __init dump_numa_memory_topology(void) count = 0; - for (i = 0; i < lmb_end_of_DRAM(); + for (i = 0; i < memblock_end_of_DRAM(); i += (1 << SECTION_SIZE_BITS)) { if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) { if (count == 0) @@ -601,60 +925,134 @@ static void __init dump_numa_memory_topology(void) } /* - * Allocate some memory, satisfying the lmb or bootmem allocator where + * Allocate some memory, satisfying the memblock or bootmem allocator where * required. nid is the preferred node and end is the physical address of * the highest address in the node. * - * Returns the physical address of the memory. + * Returns the virtual address of the memory. */ -static void __init *careful_allocation(int nid, unsigned long size, +static void __init *careful_zallocation(int nid, unsigned long size, unsigned long align, unsigned long end_pfn) { + void *ret; int new_nid; - unsigned long ret = __lmb_alloc_base(size, align, end_pfn << PAGE_SHIFT); + unsigned long ret_paddr; + + ret_paddr = __memblock_alloc_base(size, align, end_pfn << PAGE_SHIFT); /* retry over all memory */ - if (!ret) - ret = __lmb_alloc_base(size, align, lmb_end_of_DRAM()); + if (!ret_paddr) + ret_paddr = __memblock_alloc_base(size, align, memblock_end_of_DRAM()); - if (!ret) - panic("numa.c: cannot allocate %lu bytes on node %d", + if (!ret_paddr) + panic("numa.c: cannot allocate %lu bytes for node %d", size, nid); + ret = __va(ret_paddr); + /* - * If the memory came from a previously allocated node, we must - * retry with the bootmem allocator. + * We initialize the nodes in numeric order: 0, 1, 2... + * and hand over control from the MEMBLOCK allocator to the + * bootmem allocator. If this function is called for + * node 5, then we know that all nodes <5 are using the + * bootmem allocator instead of the MEMBLOCK allocator. + * + * So, check the nid from which this allocation came + * and double check to see if we need to use bootmem + * instead of the MEMBLOCK. We don't free the MEMBLOCK memory + * since it would be useless. */ - new_nid = early_pfn_to_nid(ret >> PAGE_SHIFT); + new_nid = early_pfn_to_nid(ret_paddr >> PAGE_SHIFT); if (new_nid < nid) { - ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(new_nid), + ret = __alloc_bootmem_node(NODE_DATA(new_nid), size, align, 0); - if (!ret) - panic("numa.c: cannot allocate %lu bytes on node %d", - size, new_nid); - - ret = __pa(ret); - - dbg("alloc_bootmem %lx %lx\n", ret, size); + dbg("alloc_bootmem %p %lx\n", ret, size); } - return (void *)ret; + memset(ret, 0, size); + return ret; } -static struct notifier_block __cpuinitdata ppc64_numa_nb = { +static struct notifier_block ppc64_numa_nb = { .notifier_call = cpu_numa_callback, .priority = 1 /* Must run before sched domains notifier. */ }; +static void __init mark_reserved_regions_for_nid(int nid) +{ + struct pglist_data *node = NODE_DATA(nid); + struct memblock_region *reg; + + for_each_memblock(reserved, reg) { + unsigned long physbase = reg->base; + unsigned long size = reg->size; + unsigned long start_pfn = physbase >> PAGE_SHIFT; + unsigned long end_pfn = PFN_UP(physbase + size); + struct node_active_region node_ar; + unsigned long node_end_pfn = pgdat_end_pfn(node); + + /* + * Check to make sure that this memblock.reserved area is + * within the bounds of the node that we care about. + * Checking the nid of the start and end points is not + * sufficient because the reserved area could span the + * entire node. + */ + if (end_pfn <= node->node_start_pfn || + start_pfn >= node_end_pfn) + continue; + + get_node_active_region(start_pfn, &node_ar); + while (start_pfn < end_pfn && + node_ar.start_pfn < node_ar.end_pfn) { + unsigned long reserve_size = size; + /* + * if reserved region extends past active region + * then trim size to active region + */ + if (end_pfn > node_ar.end_pfn) + reserve_size = (node_ar.end_pfn << PAGE_SHIFT) + - physbase; + /* + * Only worry about *this* node, others may not + * yet have valid NODE_DATA(). + */ + if (node_ar.nid == nid) { + dbg("reserve_bootmem %lx %lx nid=%d\n", + physbase, reserve_size, node_ar.nid); + reserve_bootmem_node(NODE_DATA(node_ar.nid), + physbase, reserve_size, + BOOTMEM_DEFAULT); + } + /* + * if reserved region is contained in the active region + * then done. + */ + if (end_pfn <= node_ar.end_pfn) + break; + + /* + * reserved region extends past the active region + * get next active region that contains this + * reserved region + */ + start_pfn = node_ar.end_pfn; + physbase = start_pfn << PAGE_SHIFT; + size = size - reserve_size; + get_node_active_region(start_pfn, &node_ar); + } + } +} + + void __init do_init_bootmem(void) { int nid; - unsigned int i; min_low_pfn = 0; - max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT; + max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; max_pfn = max_low_pfn; if (parse_numa_properties()) @@ -662,28 +1060,28 @@ void __init do_init_bootmem(void) else dump_numa_memory_topology(); - register_cpu_notifier(&ppc64_numa_nb); - cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE, - (void *)(unsigned long)boot_cpuid); - for_each_online_node(nid) { unsigned long start_pfn, end_pfn; - unsigned long bootmem_paddr; + void *bootmem_vaddr; unsigned long bootmap_pages; get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); - /* Allocate the node structure node local if possible */ - NODE_DATA(nid) = careful_allocation(nid, + /* + * Allocate the node structure node local if possible + * + * Be careful moving this around, as it relies on all + * previous nodes' bootmem to be initialized and have + * all reserved areas marked. + */ + NODE_DATA(nid) = careful_zallocation(nid, sizeof(struct pglist_data), SMP_CACHE_BYTES, end_pfn); - NODE_DATA(nid) = __va(NODE_DATA(nid)); - memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); dbg("node %d\n", nid); dbg("NODE_DATA() = %p\n", NODE_DATA(nid)); - NODE_DATA(nid)->bdata = &plat_node_bdata[nid]; + NODE_DATA(nid)->bdata = &bootmem_node_data[nid]; NODE_DATA(nid)->node_start_pfn = start_pfn; NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn; @@ -694,56 +1092,45 @@ void __init do_init_bootmem(void) dbg("end_paddr = %lx\n", end_pfn << PAGE_SHIFT); bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); - bootmem_paddr = (unsigned long)careful_allocation(nid, + bootmem_vaddr = careful_zallocation(nid, bootmap_pages << PAGE_SHIFT, PAGE_SIZE, end_pfn); - memset(__va(bootmem_paddr), 0, bootmap_pages << PAGE_SHIFT); - dbg("bootmap_paddr = %lx\n", bootmem_paddr); + dbg("bootmap_vaddr = %p\n", bootmem_vaddr); - init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT, + init_bootmem_node(NODE_DATA(nid), + __pa(bootmem_vaddr) >> PAGE_SHIFT, start_pfn, end_pfn); free_bootmem_with_active_regions(nid, end_pfn); + /* + * Be very careful about moving this around. Future + * calls to careful_zallocation() depend on this getting + * done correctly. + */ + mark_reserved_regions_for_nid(nid); + sparse_memory_present_with_active_regions(nid); + } - /* Mark reserved regions on this node */ - for (i = 0; i < lmb.reserved.cnt; i++) { - unsigned long physbase = lmb.reserved.region[i].base; - unsigned long size = lmb.reserved.region[i].size; - unsigned long start_paddr = start_pfn << PAGE_SHIFT; - unsigned long end_paddr = end_pfn << PAGE_SHIFT; - - if (early_pfn_to_nid(physbase >> PAGE_SHIFT) != nid && - early_pfn_to_nid((physbase+size-1) >> PAGE_SHIFT) != nid) - continue; + init_bootmem_done = 1; - if (physbase < end_paddr && - (physbase+size) > start_paddr) { - /* overlaps */ - if (physbase < start_paddr) { - size -= start_paddr - physbase; - physbase = start_paddr; - } - - if (size > end_paddr - physbase) - size = end_paddr - physbase; - - dbg("reserve_bootmem %lx %lx\n", physbase, - size); - reserve_bootmem_node(NODE_DATA(nid), physbase, - size, BOOTMEM_DEFAULT); - } - } + /* + * Now bootmem is initialised we can create the node to cpumask + * lookup tables and setup the cpu callback to populate them. + */ + setup_node_to_cpumask_map(); - sparse_memory_present_with_active_regions(nid); - } + reset_numa_cpu_lookup_table(); + register_cpu_notifier(&ppc64_numa_nb); + cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE, + (void *)(unsigned long)boot_cpuid); } void __init paging_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - max_zone_pfns[ZONE_DMA] = lmb_end_of_DRAM() >> PAGE_SHIFT; + max_zone_pfns[ZONE_DMA] = memblock_end_of_DRAM() >> PAGE_SHIFT; free_area_init_nodes(max_zone_pfns); } @@ -768,24 +1155,67 @@ early_param("numa", early_numa); #ifdef CONFIG_MEMORY_HOTPLUG /* - * Find the node associated with a hot added memory section. Section - * corresponds to a SPARSEMEM section, not an LMB. It is assumed that - * sections are fully contained within a single LMB. + * Find the node associated with a hot added memory section for + * memory represented in the device tree by the property + * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory. */ -int hot_add_scn_to_nid(unsigned long scn_addr) +static int hot_add_drconf_scn_to_nid(struct device_node *memory, + unsigned long scn_addr) { - struct device_node *memory = NULL; - nodemask_t nodes; - int default_nid = any_online_node(NODE_MASK_ALL); - int nid; + const __be32 *dm; + unsigned int drconf_cell_cnt, rc; + unsigned long lmb_size; + struct assoc_arrays aa; + int nid = -1; - if (!numa_enabled || (min_common_depth < 0)) - return default_nid; + drconf_cell_cnt = of_get_drconf_memory(memory, &dm); + if (!drconf_cell_cnt) + return -1; + + lmb_size = of_get_lmb_size(memory); + if (!lmb_size) + return -1; + + rc = of_get_assoc_arrays(memory, &aa); + if (rc) + return -1; - while ((memory = of_find_node_by_type(memory, "memory")) != NULL) { + for (; drconf_cell_cnt != 0; --drconf_cell_cnt) { + struct of_drconf_cell drmem; + + read_drconf_cell(&drmem, &dm); + + /* skip this block if it is reserved or not assigned to + * this partition */ + if ((drmem.flags & DRCONF_MEM_RESERVED) + || !(drmem.flags & DRCONF_MEM_ASSIGNED)) + continue; + + if ((scn_addr < drmem.base_addr) + || (scn_addr >= (drmem.base_addr + lmb_size))) + continue; + + nid = of_drconf_to_nid_single(&drmem, &aa); + break; + } + + return nid; +} + +/* + * Find the node associated with a hot added memory section for memory + * represented in the device tree as a node (i.e. memory@XXXX) for + * each memblock. + */ +static int hot_add_node_scn_to_nid(unsigned long scn_addr) +{ + struct device_node *memory; + int nid = -1; + + for_each_node_by_type(memory, "memory") { unsigned long start, size; int ranges; - const unsigned int *memcell_buf; + const __be32 *memcell_buf; unsigned int len; memcell_buf = of_get_property(memory, "reg", &len); @@ -794,33 +1224,586 @@ int hot_add_scn_to_nid(unsigned long scn_addr) /* ranges in cell */ ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells); -ha_new_range: - start = read_n_cells(n_mem_addr_cells, &memcell_buf); - size = read_n_cells(n_mem_size_cells, &memcell_buf); - nid = of_node_to_nid_single(memory); - /* Domains not present at boot default to 0 */ - if (nid < 0 || !node_online(nid)) - nid = default_nid; + while (ranges--) { + start = read_n_cells(n_mem_addr_cells, &memcell_buf); + size = read_n_cells(n_mem_size_cells, &memcell_buf); + + if ((scn_addr < start) || (scn_addr >= (start + size))) + continue; - if ((scn_addr >= start) && (scn_addr < (start + size))) { - of_node_put(memory); - goto got_nid; + nid = of_node_to_nid_single(memory); + break; } - if (--ranges) /* process all ranges in cell */ - goto ha_new_range; + if (nid >= 0) + break; } - BUG(); /* section address should be found above */ - return 0; - /* Temporary code to ensure that returned node is not empty */ -got_nid: - nodes_setall(nodes); - while (NODE_DATA(nid)->node_spanned_pages == 0) { - node_clear(nid, nodes); - nid = any_online_node(nodes); + of_node_put(memory); + + return nid; +} + +/* + * Find the node associated with a hot added memory section. Section + * corresponds to a SPARSEMEM section, not an MEMBLOCK. It is assumed that + * sections are fully contained within a single MEMBLOCK. + */ +int hot_add_scn_to_nid(unsigned long scn_addr) +{ + struct device_node *memory = NULL; + int nid, found = 0; + + if (!numa_enabled || (min_common_depth < 0)) + return first_online_node; + + memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); + if (memory) { + nid = hot_add_drconf_scn_to_nid(memory, scn_addr); + of_node_put(memory); + } else { + nid = hot_add_node_scn_to_nid(scn_addr); } + + if (nid < 0 || !node_online(nid)) + nid = first_online_node; + + if (NODE_DATA(nid)->node_spanned_pages) + return nid; + + for_each_online_node(nid) { + if (NODE_DATA(nid)->node_spanned_pages) { + found = 1; + break; + } + } + + BUG_ON(!found); return nid; } + +static u64 hot_add_drconf_memory_max(void) +{ + struct device_node *memory = NULL; + unsigned int drconf_cell_cnt = 0; + u64 lmb_size = 0; + const __be32 *dm = NULL; + + memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); + if (memory) { + drconf_cell_cnt = of_get_drconf_memory(memory, &dm); + lmb_size = of_get_lmb_size(memory); + of_node_put(memory); + } + return lmb_size * drconf_cell_cnt; +} + +/* + * memory_hotplug_max - return max address of memory that may be added + * + * This is currently only used on systems that support drconfig memory + * hotplug. + */ +u64 memory_hotplug_max(void) +{ + return max(hot_add_drconf_memory_max(), memblock_end_of_DRAM()); +} #endif /* CONFIG_MEMORY_HOTPLUG */ + +/* Virtual Processor Home Node (VPHN) support */ +#ifdef CONFIG_PPC_SPLPAR +struct topology_update_data { + struct topology_update_data *next; + unsigned int cpu; + int old_nid; + int new_nid; +}; + +static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS]; +static cpumask_t cpu_associativity_changes_mask; +static int vphn_enabled; +static int prrn_enabled; +static void reset_topology_timer(void); + +/* + * Store the current values of the associativity change counters in the + * hypervisor. + */ +static void setup_cpu_associativity_change_counters(void) +{ + int cpu; + + /* The VPHN feature supports a maximum of 8 reference points */ + BUILD_BUG_ON(MAX_DISTANCE_REF_POINTS > 8); + + for_each_possible_cpu(cpu) { + int i; + u8 *counts = vphn_cpu_change_counts[cpu]; + volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts; + + for (i = 0; i < distance_ref_points_depth; i++) + counts[i] = hypervisor_counts[i]; + } +} + +/* + * The hypervisor maintains a set of 8 associativity change counters in + * the VPA of each cpu that correspond to the associativity levels in the + * ibm,associativity-reference-points property. When an associativity + * level changes, the corresponding counter is incremented. + * + * Set a bit in cpu_associativity_changes_mask for each cpu whose home + * node associativity levels have changed. + * + * Returns the number of cpus with unhandled associativity changes. + */ +static int update_cpu_associativity_changes_mask(void) +{ + int cpu; + cpumask_t *changes = &cpu_associativity_changes_mask; + + for_each_possible_cpu(cpu) { + int i, changed = 0; + u8 *counts = vphn_cpu_change_counts[cpu]; + volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts; + + for (i = 0; i < distance_ref_points_depth; i++) { + if (hypervisor_counts[i] != counts[i]) { + counts[i] = hypervisor_counts[i]; + changed = 1; + } + } + if (changed) { + cpumask_or(changes, changes, cpu_sibling_mask(cpu)); + cpu = cpu_last_thread_sibling(cpu); + } + } + + return cpumask_weight(changes); +} + +/* + * 6 64-bit registers unpacked into 12 32-bit associativity values. To form + * the complete property we have to add the length in the first cell. + */ +#define VPHN_ASSOC_BUFSIZE (6*sizeof(u64)/sizeof(u32) + 1) + +/* + * Convert the associativity domain numbers returned from the hypervisor + * to the sequence they would appear in the ibm,associativity property. + */ +static int vphn_unpack_associativity(const long *packed, __be32 *unpacked) +{ + int i, nr_assoc_doms = 0; + const __be16 *field = (const __be16 *) packed; + +#define VPHN_FIELD_UNUSED (0xffff) +#define VPHN_FIELD_MSB (0x8000) +#define VPHN_FIELD_MASK (~VPHN_FIELD_MSB) + + for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) { + if (be16_to_cpup(field) == VPHN_FIELD_UNUSED) { + /* All significant fields processed, and remaining + * fields contain the reserved value of all 1's. + * Just store them. + */ + unpacked[i] = *((__be32 *)field); + field += 2; + } else if (be16_to_cpup(field) & VPHN_FIELD_MSB) { + /* Data is in the lower 15 bits of this field */ + unpacked[i] = cpu_to_be32( + be16_to_cpup(field) & VPHN_FIELD_MASK); + field++; + nr_assoc_doms++; + } else { + /* Data is in the lower 15 bits of this field + * concatenated with the next 16 bit field + */ + unpacked[i] = *((__be32 *)field); + field += 2; + nr_assoc_doms++; + } + } + + /* The first cell contains the length of the property */ + unpacked[0] = cpu_to_be32(nr_assoc_doms); + + return nr_assoc_doms; +} + +/* + * Retrieve the new associativity information for a virtual processor's + * home node. + */ +static long hcall_vphn(unsigned long cpu, __be32 *associativity) +{ + long rc; + long retbuf[PLPAR_HCALL9_BUFSIZE] = {0}; + u64 flags = 1; + int hwcpu = get_hard_smp_processor_id(cpu); + + rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, hwcpu); + vphn_unpack_associativity(retbuf, associativity); + + return rc; +} + +static long vphn_get_associativity(unsigned long cpu, + __be32 *associativity) +{ + long rc; + + rc = hcall_vphn(cpu, associativity); + + switch (rc) { + case H_FUNCTION: + printk(KERN_INFO + "VPHN is not supported. Disabling polling...\n"); + stop_topology_update(); + break; + case H_HARDWARE: + printk(KERN_ERR + "hcall_vphn() experienced a hardware fault " + "preventing VPHN. Disabling polling...\n"); + stop_topology_update(); + } + + return rc; +} + +/* + * Update the CPU maps and sysfs entries for a single CPU when its NUMA + * characteristics change. This function doesn't perform any locking and is + * only safe to call from stop_machine(). + */ +static int update_cpu_topology(void *data) +{ + struct topology_update_data *update; + unsigned long cpu; + + if (!data) + return -EINVAL; + + cpu = smp_processor_id(); + + for (update = data; update; update = update->next) { + if (cpu != update->cpu) + continue; + + unmap_cpu_from_node(update->cpu); + map_cpu_to_node(update->cpu, update->new_nid); + vdso_getcpu_init(); + } + + return 0; +} + +static int update_lookup_table(void *data) +{ + struct topology_update_data *update; + + if (!data) + return -EINVAL; + + /* + * Upon topology update, the numa-cpu lookup table needs to be updated + * for all threads in the core, including offline CPUs, to ensure that + * future hotplug operations respect the cpu-to-node associativity + * properly. + */ + for (update = data; update; update = update->next) { + int nid, base, j; + + nid = update->new_nid; + base = cpu_first_thread_sibling(update->cpu); + + for (j = 0; j < threads_per_core; j++) { + update_numa_cpu_lookup_table(base + j, nid); + } + } + + return 0; +} + +/* + * Update the node maps and sysfs entries for each cpu whose home node + * has changed. Returns 1 when the topology has changed, and 0 otherwise. + */ +int arch_update_cpu_topology(void) +{ + unsigned int cpu, sibling, changed = 0; + struct topology_update_data *updates, *ud; + __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; + cpumask_t updated_cpus; + struct device *dev; + int weight, new_nid, i = 0; + + weight = cpumask_weight(&cpu_associativity_changes_mask); + if (!weight) + return 0; + + updates = kzalloc(weight * (sizeof(*updates)), GFP_KERNEL); + if (!updates) + return 0; + + cpumask_clear(&updated_cpus); + + for_each_cpu(cpu, &cpu_associativity_changes_mask) { + /* + * If siblings aren't flagged for changes, updates list + * will be too short. Skip on this update and set for next + * update. + */ + if (!cpumask_subset(cpu_sibling_mask(cpu), + &cpu_associativity_changes_mask)) { + pr_info("Sibling bits not set for associativity " + "change, cpu%d\n", cpu); + cpumask_or(&cpu_associativity_changes_mask, + &cpu_associativity_changes_mask, + cpu_sibling_mask(cpu)); + cpu = cpu_last_thread_sibling(cpu); + continue; + } + + /* Use associativity from first thread for all siblings */ + vphn_get_associativity(cpu, associativity); + new_nid = associativity_to_nid(associativity); + if (new_nid < 0 || !node_online(new_nid)) + new_nid = first_online_node; + + if (new_nid == numa_cpu_lookup_table[cpu]) { + cpumask_andnot(&cpu_associativity_changes_mask, + &cpu_associativity_changes_mask, + cpu_sibling_mask(cpu)); + cpu = cpu_last_thread_sibling(cpu); + continue; + } + + for_each_cpu(sibling, cpu_sibling_mask(cpu)) { + ud = &updates[i++]; + ud->cpu = sibling; + ud->new_nid = new_nid; + ud->old_nid = numa_cpu_lookup_table[sibling]; + cpumask_set_cpu(sibling, &updated_cpus); + if (i < weight) + ud->next = &updates[i]; + } + cpu = cpu_last_thread_sibling(cpu); + } + + /* + * In cases where we have nothing to update (because the updates list + * is too short or because the new topology is same as the old one), + * skip invoking update_cpu_topology() via stop-machine(). This is + * necessary (and not just a fast-path optimization) since stop-machine + * can end up electing a random CPU to run update_cpu_topology(), and + * thus trick us into setting up incorrect cpu-node mappings (since + * 'updates' is kzalloc()'ed). + * + * And for the similar reason, we will skip all the following updating. + */ + if (!cpumask_weight(&updated_cpus)) + goto out; + + stop_machine(update_cpu_topology, &updates[0], &updated_cpus); + + /* + * Update the numa-cpu lookup table with the new mappings, even for + * offline CPUs. It is best to perform this update from the stop- + * machine context. + */ + stop_machine(update_lookup_table, &updates[0], + cpumask_of(raw_smp_processor_id())); + + for (ud = &updates[0]; ud; ud = ud->next) { + unregister_cpu_under_node(ud->cpu, ud->old_nid); + register_cpu_under_node(ud->cpu, ud->new_nid); + + dev = get_cpu_device(ud->cpu); + if (dev) + kobject_uevent(&dev->kobj, KOBJ_CHANGE); + cpumask_clear_cpu(ud->cpu, &cpu_associativity_changes_mask); + changed = 1; + } + +out: + kfree(updates); + return changed; +} + +static void topology_work_fn(struct work_struct *work) +{ + rebuild_sched_domains(); +} +static DECLARE_WORK(topology_work, topology_work_fn); + +static void topology_schedule_update(void) +{ + schedule_work(&topology_work); +} + +static void topology_timer_fn(unsigned long ignored) +{ + if (prrn_enabled && cpumask_weight(&cpu_associativity_changes_mask)) + topology_schedule_update(); + else if (vphn_enabled) { + if (update_cpu_associativity_changes_mask() > 0) + topology_schedule_update(); + reset_topology_timer(); + } +} +static struct timer_list topology_timer = + TIMER_INITIALIZER(topology_timer_fn, 0, 0); + +static void reset_topology_timer(void) +{ + topology_timer.data = 0; + topology_timer.expires = jiffies + 60 * HZ; + mod_timer(&topology_timer, topology_timer.expires); +} + +#ifdef CONFIG_SMP + +static void stage_topology_update(int core_id) +{ + cpumask_or(&cpu_associativity_changes_mask, + &cpu_associativity_changes_mask, cpu_sibling_mask(core_id)); + reset_topology_timer(); +} + +static int dt_update_callback(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct of_prop_reconfig *update; + int rc = NOTIFY_DONE; + + switch (action) { + case OF_RECONFIG_UPDATE_PROPERTY: + update = (struct of_prop_reconfig *)data; + if (!of_prop_cmp(update->dn->type, "cpu") && + !of_prop_cmp(update->prop->name, "ibm,associativity")) { + u32 core_id; + of_property_read_u32(update->dn, "reg", &core_id); + stage_topology_update(core_id); + rc = NOTIFY_OK; + } + break; + } + + return rc; +} + +static struct notifier_block dt_update_nb = { + .notifier_call = dt_update_callback, +}; + +#endif + +/* + * Start polling for associativity changes. + */ +int start_topology_update(void) +{ + int rc = 0; + + if (firmware_has_feature(FW_FEATURE_PRRN)) { + if (!prrn_enabled) { + prrn_enabled = 1; + vphn_enabled = 0; +#ifdef CONFIG_SMP + rc = of_reconfig_notifier_register(&dt_update_nb); +#endif + } + } else if (firmware_has_feature(FW_FEATURE_VPHN) && + lppaca_shared_proc(get_lppaca())) { + if (!vphn_enabled) { + prrn_enabled = 0; + vphn_enabled = 1; + setup_cpu_associativity_change_counters(); + init_timer_deferrable(&topology_timer); + reset_topology_timer(); + } + } + + return rc; +} + +/* + * Disable polling for VPHN associativity changes. + */ +int stop_topology_update(void) +{ + int rc = 0; + + if (prrn_enabled) { + prrn_enabled = 0; +#ifdef CONFIG_SMP + rc = of_reconfig_notifier_unregister(&dt_update_nb); +#endif + } else if (vphn_enabled) { + vphn_enabled = 0; + rc = del_timer_sync(&topology_timer); + } + + return rc; +} + +int prrn_is_enabled(void) +{ + return prrn_enabled; +} + +static int topology_read(struct seq_file *file, void *v) +{ + if (vphn_enabled || prrn_enabled) + seq_puts(file, "on\n"); + else + seq_puts(file, "off\n"); + + return 0; +} + +static int topology_open(struct inode *inode, struct file *file) +{ + return single_open(file, topology_read, NULL); +} + +static ssize_t topology_write(struct file *file, const char __user *buf, + size_t count, loff_t *off) +{ + char kbuf[4]; /* "on" or "off" plus null. */ + int read_len; + + read_len = count < 3 ? count : 3; + if (copy_from_user(kbuf, buf, read_len)) + return -EINVAL; + + kbuf[read_len] = '\0'; + + if (!strncmp(kbuf, "on", 2)) + start_topology_update(); + else if (!strncmp(kbuf, "off", 3)) + stop_topology_update(); + else + return -EINVAL; + + return count; +} + +static const struct file_operations topology_ops = { + .read = seq_read, + .write = topology_write, + .open = topology_open, + .release = single_release +}; + +static int topology_update_init(void) +{ + start_topology_update(); + proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops); + + return 0; +} +device_initcall(topology_update_init); +#endif /* CONFIG_PPC_SPLPAR */ diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c new file mode 100644 index 00000000000..c695943a513 --- /dev/null +++ b/arch/powerpc/mm/pgtable.c @@ -0,0 +1,236 @@ +/* + * This file contains common routines for dealing with free of page tables + * Along with common page table handling code + * + * Derived from arch/powerpc/mm/tlb_64.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * Dave Engebretsen <engebret@us.ibm.com> + * Rework for PPC64 port. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/gfp.h> +#include <linux/mm.h> +#include <linux/percpu.h> +#include <linux/hardirq.h> +#include <linux/hugetlb.h> +#include <asm/pgalloc.h> +#include <asm/tlbflush.h> +#include <asm/tlb.h> + +static inline int is_exec_fault(void) +{ + return current->thread.regs && TRAP(current->thread.regs) == 0x400; +} + +/* We only try to do i/d cache coherency on stuff that looks like + * reasonably "normal" PTEs. We currently require a PTE to be present + * and we avoid _PAGE_SPECIAL and _PAGE_NO_CACHE. We also only do that + * on userspace PTEs + */ +static inline int pte_looks_normal(pte_t pte) +{ + return (pte_val(pte) & + (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER)) == + (_PAGE_PRESENT | _PAGE_USER); +} + +struct page * maybe_pte_to_page(pte_t pte) +{ + unsigned long pfn = pte_pfn(pte); + struct page *page; + + if (unlikely(!pfn_valid(pfn))) + return NULL; + page = pfn_to_page(pfn); + if (PageReserved(page)) + return NULL; + return page; +} + +#if defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0 + +/* Server-style MMU handles coherency when hashing if HW exec permission + * is supposed per page (currently 64-bit only). If not, then, we always + * flush the cache for valid PTEs in set_pte. Embedded CPU without HW exec + * support falls into the same category. + */ + +static pte_t set_pte_filter(pte_t pte) +{ + pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); + if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) || + cpu_has_feature(CPU_FTR_NOEXECUTE))) { + struct page *pg = maybe_pte_to_page(pte); + if (!pg) + return pte; + if (!test_bit(PG_arch_1, &pg->flags)) { + flush_dcache_icache_page(pg); + set_bit(PG_arch_1, &pg->flags); + } + } + return pte; +} + +static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, + int dirty) +{ + return pte; +} + +#else /* defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0 */ + +/* Embedded type MMU with HW exec support. This is a bit more complicated + * as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC so + * instead we "filter out" the exec permission for non clean pages. + */ +static pte_t set_pte_filter(pte_t pte) +{ + struct page *pg; + + /* No exec permission in the first place, move on */ + if (!(pte_val(pte) & _PAGE_EXEC) || !pte_looks_normal(pte)) + return pte; + + /* If you set _PAGE_EXEC on weird pages you're on your own */ + pg = maybe_pte_to_page(pte); + if (unlikely(!pg)) + return pte; + + /* If the page clean, we move on */ + if (test_bit(PG_arch_1, &pg->flags)) + return pte; + + /* If it's an exec fault, we flush the cache and make it clean */ + if (is_exec_fault()) { + flush_dcache_icache_page(pg); + set_bit(PG_arch_1, &pg->flags); + return pte; + } + + /* Else, we filter out _PAGE_EXEC */ + return __pte(pte_val(pte) & ~_PAGE_EXEC); +} + +static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, + int dirty) +{ + struct page *pg; + + /* So here, we only care about exec faults, as we use them + * to recover lost _PAGE_EXEC and perform I$/D$ coherency + * if necessary. Also if _PAGE_EXEC is already set, same deal, + * we just bail out + */ + if (dirty || (pte_val(pte) & _PAGE_EXEC) || !is_exec_fault()) + return pte; + +#ifdef CONFIG_DEBUG_VM + /* So this is an exec fault, _PAGE_EXEC is not set. If it was + * an error we would have bailed out earlier in do_page_fault() + * but let's make sure of it + */ + if (WARN_ON(!(vma->vm_flags & VM_EXEC))) + return pte; +#endif /* CONFIG_DEBUG_VM */ + + /* If you set _PAGE_EXEC on weird pages you're on your own */ + pg = maybe_pte_to_page(pte); + if (unlikely(!pg)) + goto bail; + + /* If the page is already clean, we move on */ + if (test_bit(PG_arch_1, &pg->flags)) + goto bail; + + /* Clean the page and set PG_arch_1 */ + flush_dcache_icache_page(pg); + set_bit(PG_arch_1, &pg->flags); + + bail: + return __pte(pte_val(pte) | _PAGE_EXEC); +} + +#endif /* !(defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0) */ + +/* + * set_pte stores a linux PTE into the linux page table. + */ +void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t pte) +{ +#ifdef CONFIG_DEBUG_VM + WARN_ON(pte_val(*ptep) & _PAGE_PRESENT); +#endif + /* Note: mm->context.id might not yet have been assigned as + * this context might not have been activated yet when this + * is called. + */ + pte = set_pte_filter(pte); + + /* Perform the setting of the PTE */ + __set_pte_at(mm, addr, ptep, pte, 0); +} + +/* + * This is called when relaxing access to a PTE. It's also called in the page + * fault path when we don't hit any of the major fault cases, ie, a minor + * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have + * handled those two for us, we additionally deal with missing execute + * permission here on some processors + */ +int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, + pte_t *ptep, pte_t entry, int dirty) +{ + int changed; + entry = set_access_flags_filter(entry, vma, dirty); + changed = !pte_same(*(ptep), entry); + if (changed) { + if (!is_vm_hugetlb_page(vma)) + assert_pte_locked(vma->vm_mm, address); + __ptep_set_access_flags(ptep, entry); + flush_tlb_page_nohash(vma, address); + } + return changed; +} + +#ifdef CONFIG_DEBUG_VM +void assert_pte_locked(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + if (mm == &init_mm) + return; + pgd = mm->pgd + pgd_index(addr); + BUG_ON(pgd_none(*pgd)); + pud = pud_offset(pgd, addr); + BUG_ON(pud_none(*pud)); + pmd = pmd_offset(pud, addr); + /* + * khugepaged to collapse normal pages to hugepage, first set + * pmd to none to force page fault/gup to take mmap_sem. After + * pmd is set to none, we do a pte_clear which does this assertion + * so if we find pmd none, return. + */ + if (pmd_none(*pmd)) + return; + BUG_ON(!pmd_present(*pmd)); + assert_spin_locked(pte_lockptr(mm, pmd)); +} +#endif /* CONFIG_DEBUG_VM */ + diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index ac3390f8190..343a87fa78b 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -26,10 +26,14 @@ #include <linux/vmalloc.h> #include <linux/init.h> #include <linux/highmem.h> +#include <linux/memblock.h> +#include <linux/slab.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> +#include <asm/fixmap.h> #include <asm/io.h> +#include <asm/setup.h> #include "mmu_decl.h" @@ -47,14 +51,10 @@ EXPORT_SYMBOL(ioremap_bot); /* aka VMALLOC_END */ extern char etext[], _stext[]; -#ifdef CONFIG_SMP -extern void hash_page_sync(void); -#endif - #ifdef HAVE_BATS -extern unsigned long v_mapped_by_bats(unsigned long va); -extern unsigned long p_mapped_by_bats(unsigned long pa); -void setbat(int index, unsigned long virt, unsigned long phys, +extern phys_addr_t v_mapped_by_bats(unsigned long va); +extern unsigned long p_mapped_by_bats(phys_addr_t pa); +void setbat(int index, unsigned long virt, phys_addr_t phys, unsigned int size, int flags); #else /* !HAVE_BATS */ @@ -64,31 +64,36 @@ void setbat(int index, unsigned long virt, unsigned long phys, #ifdef HAVE_TLBCAM extern unsigned int tlbcam_index; -extern unsigned long v_mapped_by_tlbcam(unsigned long va); -extern unsigned long p_mapped_by_tlbcam(unsigned long pa); +extern phys_addr_t v_mapped_by_tlbcam(unsigned long va); +extern unsigned long p_mapped_by_tlbcam(phys_addr_t pa); #else /* !HAVE_TLBCAM */ #define v_mapped_by_tlbcam(x) (0UL) #define p_mapped_by_tlbcam(x) (0UL) #endif /* HAVE_TLBCAM */ -#ifdef CONFIG_PTE_64BIT -/* 44x uses an 8kB pgdir because it has 8-byte Linux PTEs. */ -#define PGDIR_ORDER 1 -#else -#define PGDIR_ORDER 0 -#endif +#define PGDIR_ORDER (32 + PGD_T_LOG2 - PGDIR_SHIFT) pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *ret; - ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, PGDIR_ORDER); + /* pgdir take page or two with 4K pages and a page fraction otherwise */ +#ifndef CONFIG_PPC_4K_PAGES + ret = kzalloc(1 << PGDIR_ORDER, GFP_KERNEL); +#else + ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, + PGDIR_ORDER - PAGE_SHIFT); +#endif return ret; } void pgd_free(struct mm_struct *mm, pgd_t *pgd) { - free_pages((unsigned long)pgd, PGDIR_ORDER); +#ifndef CONFIG_PPC_4K_PAGES + kfree((void *)pgd); +#else + free_pages((unsigned long)pgd, PGDIR_ORDER - PAGE_SHIFT); +#endif } __init_refok pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) @@ -111,57 +116,78 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) { struct page *ptepage; -#ifdef CONFIG_HIGHPTE - gfp_t flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_REPEAT | __GFP_ZERO; -#else gfp_t flags = GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO; -#endif ptepage = alloc_pages(flags, 0); if (!ptepage) return NULL; - pgtable_page_ctor(ptepage); + if (!pgtable_page_ctor(ptepage)) { + __free_page(ptepage); + return NULL; + } return ptepage; } -void pte_free_kernel(struct mm_struct *mm, pte_t *pte) +void __iomem * +ioremap(phys_addr_t addr, unsigned long size) { -#ifdef CONFIG_SMP - hash_page_sync(); -#endif - free_page((unsigned long)pte); + return __ioremap_caller(addr, size, _PAGE_NO_CACHE | _PAGE_GUARDED, + __builtin_return_address(0)); } +EXPORT_SYMBOL(ioremap); -void pte_free(struct mm_struct *mm, pgtable_t ptepage) +void __iomem * +ioremap_wc(phys_addr_t addr, unsigned long size) { -#ifdef CONFIG_SMP - hash_page_sync(); -#endif - pgtable_page_dtor(ptepage); - __free_page(ptepage); + return __ioremap_caller(addr, size, _PAGE_NO_CACHE, + __builtin_return_address(0)); } +EXPORT_SYMBOL(ioremap_wc); void __iomem * -ioremap(phys_addr_t addr, unsigned long size) +ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags) { - return __ioremap(addr, size, _PAGE_NO_CACHE); + /* writeable implies dirty for kernel addresses */ + if (flags & _PAGE_RW) + flags |= _PAGE_DIRTY | _PAGE_HWWRITE; + + /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */ + flags &= ~(_PAGE_USER | _PAGE_EXEC); + +#ifdef _PAGE_BAP_SR + /* _PAGE_USER contains _PAGE_BAP_SR on BookE using the new PTE format + * which means that we just cleared supervisor access... oops ;-) This + * restores it + */ + flags |= _PAGE_BAP_SR; +#endif + + return __ioremap_caller(addr, size, flags, __builtin_return_address(0)); } -EXPORT_SYMBOL(ioremap); +EXPORT_SYMBOL(ioremap_prot); void __iomem * -ioremap_flags(phys_addr_t addr, unsigned long size, unsigned long flags) +__ioremap(phys_addr_t addr, unsigned long size, unsigned long flags) { - return __ioremap(addr, size, flags); + return __ioremap_caller(addr, size, flags, __builtin_return_address(0)); } -EXPORT_SYMBOL(ioremap_flags); void __iomem * -__ioremap(phys_addr_t addr, unsigned long size, unsigned long flags) +__ioremap_caller(phys_addr_t addr, unsigned long size, unsigned long flags, + void *caller) { unsigned long v, i; phys_addr_t p; int err; + /* Make sure we have the base flags */ + if ((flags & _PAGE_PRESENT) == 0) + flags |= PAGE_KERNEL; + + /* Non-cacheable page cannot be coherent */ + if (flags & _PAGE_NO_CACHE) + flags &= ~_PAGE_COHERENT; + /* * Choose an address to map it to. * Once the vmalloc system is running, we use it. @@ -178,15 +204,18 @@ __ioremap(phys_addr_t addr, unsigned long size, unsigned long flags) if (p < 16*1024*1024) p += _ISA_MEM_BASE; +#ifndef CONFIG_CRASH_DUMP /* * Don't allow anybody to remap normal RAM that we're using. * mem_init() sets high_memory so only do the check after that. */ - if (mem_init_done && (p < virt_to_phys(high_memory))) { - printk("__ioremap(): phys addr 0x%llx is RAM lr %p\n", + if (mem_init_done && (p < virt_to_phys(high_memory)) && + !(__allow_ioremap_reserved && memblock_is_region_reserved(p, size))) { + printk("__ioremap(): phys addr 0x%llx is RAM lr %pf\n", (unsigned long long)p, __builtin_return_address(0)); return NULL; } +#endif if (size == 0) return NULL; @@ -210,19 +239,15 @@ __ioremap(phys_addr_t addr, unsigned long size, unsigned long flags) if (mem_init_done) { struct vm_struct *area; - area = get_vm_area(size, VM_IOREMAP); + area = get_vm_area_caller(size, VM_IOREMAP, caller); if (area == 0) return NULL; + area->phys_addr = p; v = (unsigned long) area->addr; } else { v = (ioremap_bot -= size); } - if ((flags & _PAGE_PRESENT) == 0) - flags |= _PAGE_KERNEL; - if (flags & _PAGE_NO_CACHE) - flags |= _PAGE_GUARDED; - /* * Should check if it is a candidate for a BAT mapping */ @@ -269,27 +294,30 @@ int map_page(unsigned long va, phys_addr_t pa, int flags) /* The PTE should never be already set nor present in the * hash table */ - BUG_ON(pte_val(*pg) & (_PAGE_PRESENT | _PAGE_HASHPTE)); + BUG_ON((pte_val(*pg) & (_PAGE_PRESENT | _PAGE_HASHPTE)) && + flags); set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT, __pgprot(flags))); } + smp_wmb(); return err; } /* - * Map in all of physical memory starting at KERNELBASE. + * Map in a chunk of physical memory starting at start. */ -void __init mapin_ram(void) +void __init __mapin_ram_chunk(unsigned long offset, unsigned long top) { - unsigned long v, p, s, f; + unsigned long v, s, f; + phys_addr_t p; int ktext; - s = mmu_mapin_ram(); - v = KERNELBASE + s; - p = PPC_MEMSTART + s; - for (; s < total_lowmem; s += PAGE_SIZE) { + s = offset; + v = PAGE_OFFSET + s; + p = memstart_addr + s; + for (; s < top; s += PAGE_SIZE) { ktext = ((char *) v >= _stext && (char *) v < etext); - f = ktext ?_PAGE_RAM_TEXT : _PAGE_RAM; + f = ktext ? PAGE_KERNEL_TEXT : PAGE_KERNEL; map_page(v, p, f); #ifdef CONFIG_PPC_STD_MMU_32 if (ktext) @@ -300,6 +328,30 @@ void __init mapin_ram(void) } } +void __init mapin_ram(void) +{ + unsigned long s, top; + +#ifndef CONFIG_WII + top = total_lowmem; + s = mmu_mapin_ram(top); + __mapin_ram_chunk(s, top); +#else + if (!wii_hole_size) { + s = mmu_mapin_ram(total_lowmem); + __mapin_ram_chunk(s, total_lowmem); + } else { + top = wii_hole_start; + s = mmu_mapin_ram(top); + __mapin_ram_chunk(s, top); + + top = memblock_end_of_DRAM(); + s = wii_mmu_mapin_mem2(top); + __mapin_ram_chunk(s, top); + } +#endif +} + /* Scan the real Linux page tables and return a PTE pointer for * a virtual address in a context. * Returns true (1) if PTE was found, zero otherwise. The pointer to @@ -349,9 +401,9 @@ static int __change_page_attr(struct page *page, pgprot_t prot) return 0; if (!get_pteptr(&init_mm, address, &kpte, &kpmd)) return -EINVAL; - set_pte_at(&init_mm, address, kpte, mk_pte(page, prot)); + __set_pte_at(&init_mm, address, kpte, mk_pte(page, prot), 0); wmb(); - flush_HPTE(0, address, pmd_val(*kpmd)); + flush_tlb_page(NULL, address); pte_unmap(kpte); return 0; @@ -386,3 +438,23 @@ void kernel_map_pages(struct page *page, int numpages, int enable) change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0)); } #endif /* CONFIG_DEBUG_PAGEALLOC */ + +static int fixmaps; + +void __set_fixmap (enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) +{ + unsigned long address = __fix_to_virt(idx); + + if (idx >= __end_of_fixed_addresses) { + BUG(); + return; + } + + map_page(address, phys, pgprot_val(flags)); + fixmaps++; +} + +void __this_fixmap_does_not_exist(void) +{ + WARN_ON(1); +} diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 3ef0ad2f9ca..f6ce1f111f5 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -26,13 +26,16 @@ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/string.h> +#include <linux/export.h> #include <linux/types.h> #include <linux/mman.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/stddef.h> #include <linux/vmalloc.h> -#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/memblock.h> +#include <linux/slab.h> #include <asm/pgalloc.h> #include <asm/page.h> @@ -47,27 +50,52 @@ #include <asm/processor.h> #include <asm/cputable.h> #include <asm/sections.h> -#include <asm/system.h> -#include <asm/abs_addr.h> #include <asm/firmware.h> #include "mmu_decl.h" +/* Some sanity checking */ +#if TASK_SIZE_USER64 > PGTABLE_RANGE +#error TASK_SIZE_USER64 exceeds pagetable range +#endif + +#ifdef CONFIG_PPC_STD_MMU_64 +#if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT)) +#error TASK_SIZE_USER64 exceeds user VSID range +#endif +#endif + unsigned long ioremap_bot = IOREMAP_BASE; +#ifdef CONFIG_PPC_MMU_NOHASH +static void *early_alloc_pgtable(unsigned long size) +{ + void *pt; + + if (init_bootmem_done) + pt = __alloc_bootmem(size, size, __pa(MAX_DMA_ADDRESS)); + else + pt = __va(memblock_alloc_base(size, size, + __pa(MAX_DMA_ADDRESS))); + memset(pt, 0, size); + + return pt; +} +#endif /* CONFIG_PPC_MMU_NOHASH */ + /* - * map_io_page currently only called by __ioremap - * map_io_page adds an entry to the ioremap page table + * map_kernel_page currently only called by __ioremap + * map_kernel_page adds an entry to the ioremap page table * and adds an entry to the HPT, possibly bolting it */ -static int map_io_page(unsigned long ea, unsigned long pa, int flags) +int map_kernel_page(unsigned long ea, unsigned long pa, int flags) { pgd_t *pgdp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep; - if (mem_init_done) { + if (slab_is_available()) { pgdp = pgd_offset_k(ea); pudp = pud_alloc(&init_mm, pgdp, ea); if (!pudp) @@ -81,6 +109,35 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags) set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, __pgprot(flags))); } else { +#ifdef CONFIG_PPC_MMU_NOHASH + /* Warning ! This will blow up if bootmem is not initialized + * which our ppc64 code is keen to do that, we'll need to + * fix it and/or be more careful + */ + pgdp = pgd_offset_k(ea); +#ifdef PUD_TABLE_SIZE + if (pgd_none(*pgdp)) { + pudp = early_alloc_pgtable(PUD_TABLE_SIZE); + BUG_ON(pudp == NULL); + pgd_populate(&init_mm, pgdp, pudp); + } +#endif /* PUD_TABLE_SIZE */ + pudp = pud_offset(pgdp, ea); + if (pud_none(*pudp)) { + pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); + BUG_ON(pmdp == NULL); + pud_populate(&init_mm, pudp, pmdp); + } + pmdp = pmd_offset(pudp, ea); + if (!pmd_present(*pmdp)) { + ptep = early_alloc_pgtable(PAGE_SIZE); + BUG_ON(ptep == NULL); + pmd_populate_kernel(&init_mm, pmdp, ptep); + } + ptep = pte_offset_kernel(pmdp, ea); + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, + __pgprot(flags))); +#else /* CONFIG_PPC_MMU_NOHASH */ /* * If the mm subsystem is not fully up, we cannot create a * linux page table entry for this mapping. Simply bolt an @@ -93,7 +150,20 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags) "memory at %016lx !\n", pa); return -ENOMEM; } +#endif /* !CONFIG_PPC_MMU_NOHASH */ } + +#ifdef CONFIG_PPC_BOOK3E_64 + /* + * With hardware tablewalk, a sync is needed to ensure that + * subsequent accesses see the PTE we just wrote. Unlike userspace + * mappings, we can't tolerate spurious faults, so make sure + * the new PTE will be seen the first time. + */ + mb(); +#else + smp_wmb(); +#endif return 0; } @@ -107,15 +177,24 @@ void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size, { unsigned long i; + /* Make sure we have the base flags */ if ((flags & _PAGE_PRESENT) == 0) flags |= pgprot_val(PAGE_KERNEL); + /* Non-cacheable page cannot be coherent */ + if (flags & _PAGE_NO_CACHE) + flags &= ~_PAGE_COHERENT; + + /* We don't support the 4K PFN hack with ioremap */ + if (flags & _PAGE_4K_PFN) + return NULL; + WARN_ON(pa & ~PAGE_MASK); WARN_ON(((unsigned long)ea) & ~PAGE_MASK); WARN_ON(size & ~PAGE_MASK); for (i = 0; i < size; i += PAGE_SIZE) - if (map_io_page((unsigned long)ea+i, pa+i, flags)) + if (map_kernel_page((unsigned long)ea+i, pa+i, flags)) return NULL; return (void __iomem *)ea; @@ -135,8 +214,8 @@ void __iounmap_at(void *ea, unsigned long size) unmap_kernel_range((unsigned long)ea, size); } -void __iomem * __ioremap(phys_addr_t addr, unsigned long size, - unsigned long flags) +void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size, + unsigned long flags, void *caller) { phys_addr_t paligned; void __iomem *ret; @@ -159,10 +238,13 @@ void __iomem * __ioremap(phys_addr_t addr, unsigned long size, if (mem_init_done) { struct vm_struct *area; - area = __get_vm_area(size, VM_IOREMAP, - ioremap_bot, IOREMAP_END); + area = __get_vm_area_caller(size, VM_IOREMAP, + ioremap_bot, IOREMAP_END, + caller); if (area == NULL) return NULL; + + area->phys_addr = paligned; ret = __ioremap_at(paligned, area->addr, size, flags); if (!ret) vunmap(area->addr); @@ -177,22 +259,55 @@ void __iomem * __ioremap(phys_addr_t addr, unsigned long size, return ret; } +void __iomem * __ioremap(phys_addr_t addr, unsigned long size, + unsigned long flags) +{ + return __ioremap_caller(addr, size, flags, __builtin_return_address(0)); +} void __iomem * ioremap(phys_addr_t addr, unsigned long size) { unsigned long flags = _PAGE_NO_CACHE | _PAGE_GUARDED; + void *caller = __builtin_return_address(0); if (ppc_md.ioremap) - return ppc_md.ioremap(addr, size, flags); - return __ioremap(addr, size, flags); + return ppc_md.ioremap(addr, size, flags, caller); + return __ioremap_caller(addr, size, flags, caller); } -void __iomem * ioremap_flags(phys_addr_t addr, unsigned long size, +void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size) +{ + unsigned long flags = _PAGE_NO_CACHE; + void *caller = __builtin_return_address(0); + + if (ppc_md.ioremap) + return ppc_md.ioremap(addr, size, flags, caller); + return __ioremap_caller(addr, size, flags, caller); +} + +void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags) { + void *caller = __builtin_return_address(0); + + /* writeable implies dirty for kernel addresses */ + if (flags & _PAGE_RW) + flags |= _PAGE_DIRTY; + + /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */ + flags &= ~(_PAGE_USER | _PAGE_EXEC); + +#ifdef _PAGE_BAP_SR + /* _PAGE_USER contains _PAGE_BAP_SR on BookE using the new PTE format + * which means that we just cleared supervisor access... oops ;-) This + * restores it + */ + flags |= _PAGE_BAP_SR; +#endif + if (ppc_md.ioremap) - return ppc_md.ioremap(addr, size, flags); - return __ioremap(addr, size, flags); + return ppc_md.ioremap(addr, size, flags, caller); + return __ioremap_caller(addr, size, flags, caller); } @@ -226,9 +341,550 @@ void iounmap(volatile void __iomem *token) } EXPORT_SYMBOL(ioremap); -EXPORT_SYMBOL(ioremap_flags); +EXPORT_SYMBOL(ioremap_wc); +EXPORT_SYMBOL(ioremap_prot); EXPORT_SYMBOL(__ioremap); EXPORT_SYMBOL(__ioremap_at); EXPORT_SYMBOL(iounmap); EXPORT_SYMBOL(__iounmap); EXPORT_SYMBOL(__iounmap_at); + +/* + * For hugepage we have pfn in the pmd, we use PTE_RPN_SHIFT bits for flags + * For PTE page, we have a PTE_FRAG_SIZE (4K) aligned virtual address. + */ +struct page *pmd_page(pmd_t pmd) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (pmd_trans_huge(pmd)) + return pfn_to_page(pmd_pfn(pmd)); +#endif + return virt_to_page(pmd_page_vaddr(pmd)); +} + +#ifdef CONFIG_PPC_64K_PAGES +static pte_t *get_from_cache(struct mm_struct *mm) +{ + void *pte_frag, *ret; + + spin_lock(&mm->page_table_lock); + ret = mm->context.pte_frag; + if (ret) { + pte_frag = ret + PTE_FRAG_SIZE; + /* + * If we have taken up all the fragments mark PTE page NULL + */ + if (((unsigned long)pte_frag & ~PAGE_MASK) == 0) + pte_frag = NULL; + mm->context.pte_frag = pte_frag; + } + spin_unlock(&mm->page_table_lock); + return (pte_t *)ret; +} + +static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel) +{ + void *ret = NULL; + struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | + __GFP_REPEAT | __GFP_ZERO); + if (!page) + return NULL; + if (!kernel && !pgtable_page_ctor(page)) { + __free_page(page); + return NULL; + } + + ret = page_address(page); + spin_lock(&mm->page_table_lock); + /* + * If we find pgtable_page set, we return + * the allocated page with single fragement + * count. + */ + if (likely(!mm->context.pte_frag)) { + atomic_set(&page->_count, PTE_FRAG_NR); + mm->context.pte_frag = ret + PTE_FRAG_SIZE; + } + spin_unlock(&mm->page_table_lock); + + return (pte_t *)ret; +} + +pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel) +{ + pte_t *pte; + + pte = get_from_cache(mm); + if (pte) + return pte; + + return __alloc_for_cache(mm, kernel); +} + +void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel) +{ + struct page *page = virt_to_page(table); + if (put_page_testzero(page)) { + if (!kernel) + pgtable_page_dtor(page); + free_hot_cold_page(page, 0); + } +} + +#ifdef CONFIG_SMP +static void page_table_free_rcu(void *table) +{ + struct page *page = virt_to_page(table); + if (put_page_testzero(page)) { + pgtable_page_dtor(page); + free_hot_cold_page(page, 0); + } +} + +void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) +{ + unsigned long pgf = (unsigned long)table; + + BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); + pgf |= shift; + tlb_remove_table(tlb, (void *)pgf); +} + +void __tlb_remove_table(void *_table) +{ + void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE); + unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE; + + if (!shift) + /* PTE page needs special handling */ + page_table_free_rcu(table); + else { + BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); + kmem_cache_free(PGT_CACHE(shift), table); + } +} +#else +void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) +{ + if (!shift) { + /* PTE page needs special handling */ + struct page *page = virt_to_page(table); + if (put_page_testzero(page)) { + pgtable_page_dtor(page); + free_hot_cold_page(page, 0); + } + } else { + BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); + kmem_cache_free(PGT_CACHE(shift), table); + } +} +#endif +#endif /* CONFIG_PPC_64K_PAGES */ + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +/* + * This is called when relaxing access to a hugepage. It's also called in the page + * fault path when we don't hit any of the major fault cases, ie, a minor + * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have + * handled those two for us, we additionally deal with missing execute + * permission here on some processors + */ +int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp, pmd_t entry, int dirty) +{ + int changed; +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pmd_trans_huge(*pmdp)); + assert_spin_locked(&vma->vm_mm->page_table_lock); +#endif + changed = !pmd_same(*(pmdp), entry); + if (changed) { + __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry)); + /* + * Since we are not supporting SW TLB systems, we don't + * have any thing similar to flush_tlb_page_nohash() + */ + } + return changed; +} + +unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, unsigned long clr, + unsigned long set) +{ + + unsigned long old, tmp; + +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pmd_trans_huge(*pmdp)); + assert_spin_locked(&mm->page_table_lock); +#endif + +#ifdef PTE_ATOMIC_UPDATES + __asm__ __volatile__( + "1: ldarx %0,0,%3\n\ + andi. %1,%0,%6\n\ + bne- 1b \n\ + andc %1,%0,%4 \n\ + or %1,%1,%7\n\ + stdcx. %1,0,%3 \n\ + bne- 1b" + : "=&r" (old), "=&r" (tmp), "=m" (*pmdp) + : "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY), "r" (set) + : "cc" ); +#else + old = pmd_val(*pmdp); + *pmdp = __pmd((old & ~clr) | set); +#endif + if (old & _PAGE_HASHPTE) + hpte_do_hugepage_flush(mm, addr, pmdp); + return old; +} + +pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + pmd_t pmd; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + if (pmd_trans_huge(*pmdp)) { + pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp); + } else { + /* + * khugepaged calls this for normal pmd + */ + pmd = *pmdp; + pmd_clear(pmdp); + /* + * Wait for all pending hash_page to finish. This is needed + * in case of subpage collapse. When we collapse normal pages + * to hugepage, we first clear the pmd, then invalidate all + * the PTE entries. The assumption here is that any low level + * page fault will see a none pmd and take the slow path that + * will wait on mmap_sem. But we could very well be in a + * hash_page with local ptep pointer value. Such a hash page + * can result in adding new HPTE entries for normal subpages. + * That means we could be modifying the page content as we + * copy them to a huge page. So wait for parallel hash_page + * to finish before invalidating HPTE entries. We can do this + * by sending an IPI to all the cpus and executing a dummy + * function there. + */ + kick_all_cpus_sync(); + /* + * Now invalidate the hpte entries in the range + * covered by pmd. This make sure we take a + * fault and will find the pmd as none, which will + * result in a major fault which takes mmap_sem and + * hence wait for collapse to complete. Without this + * the __collapse_huge_page_copy can result in copying + * the old content. + */ + flush_tlb_pmd_range(vma->vm_mm, &pmd, address); + } + return pmd; +} + +int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); +} + +/* + * We currently remove entries from the hashtable regardless of whether + * the entry was young or dirty. The generic routines only flush if the + * entry was young or dirty which is not good enough. + * + * We should be more intelligent about this but for the moment we override + * these functions and force a tlb flush unconditionally + */ +int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); +} + +/* + * We mark the pmd splitting and invalidate all the hpte + * entries for this hugepage. + */ +void pmdp_splitting_flush(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + unsigned long old, tmp; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pmd_trans_huge(*pmdp)); + assert_spin_locked(&vma->vm_mm->page_table_lock); +#endif + +#ifdef PTE_ATOMIC_UPDATES + + __asm__ __volatile__( + "1: ldarx %0,0,%3\n\ + andi. %1,%0,%6\n\ + bne- 1b \n\ + ori %1,%0,%4 \n\ + stdcx. %1,0,%3 \n\ + bne- 1b" + : "=&r" (old), "=&r" (tmp), "=m" (*pmdp) + : "r" (pmdp), "i" (_PAGE_SPLITTING), "m" (*pmdp), "i" (_PAGE_BUSY) + : "cc" ); +#else + old = pmd_val(*pmdp); + *pmdp = __pmd(old | _PAGE_SPLITTING); +#endif + /* + * If we didn't had the splitting flag set, go and flush the + * HPTE entries. + */ + if (!(old & _PAGE_SPLITTING)) { + /* We need to flush the hpte */ + if (old & _PAGE_HASHPTE) + hpte_do_hugepage_flush(vma->vm_mm, address, pmdp); + } + /* + * This ensures that generic code that rely on IRQ disabling + * to prevent a parallel THP split work as expected. + */ + kick_all_cpus_sync(); +} + +/* + * We want to put the pgtable in pmd and use pgtable for tracking + * the base page size hptes + */ +void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) +{ + pgtable_t *pgtable_slot; + assert_spin_locked(&mm->page_table_lock); + /* + * we store the pgtable in the second half of PMD + */ + pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; + *pgtable_slot = pgtable; + /* + * expose the deposited pgtable to other cpus. + * before we set the hugepage PTE at pmd level + * hash fault code looks at the deposted pgtable + * to store hash index values. + */ + smp_wmb(); +} + +pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) +{ + pgtable_t pgtable; + pgtable_t *pgtable_slot; + + assert_spin_locked(&mm->page_table_lock); + pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; + pgtable = *pgtable_slot; + /* + * Once we withdraw, mark the entry NULL. + */ + *pgtable_slot = NULL; + /* + * We store HPTE information in the deposited PTE fragment. + * zero out the content on withdraw. + */ + memset(pgtable, 0, PTE_FRAG_SIZE); + return pgtable; +} + +/* + * set a new huge pmd. We should not be called for updating + * an existing pmd entry. That should go via pmd_hugepage_update. + */ +void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) +{ +#ifdef CONFIG_DEBUG_VM + WARN_ON(pmd_val(*pmdp) & _PAGE_PRESENT); + assert_spin_locked(&mm->page_table_lock); + WARN_ON(!pmd_trans_huge(pmd)); +#endif + return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); +} + +void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0); +} + +/* + * A linux hugepage PMD was changed and the corresponding hash table entries + * neesd to be flushed. + */ +void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp) +{ + int ssize, i; + unsigned long s_addr; + int max_hpte_count; + unsigned int psize, valid; + unsigned char *hpte_slot_array; + unsigned long hidx, vpn, vsid, hash, shift, slot; + + /* + * Flush all the hptes mapping this hugepage + */ + s_addr = addr & HPAGE_PMD_MASK; + hpte_slot_array = get_hpte_slot_array(pmdp); + /* + * IF we try to do a HUGE PTE update after a withdraw is done. + * we will find the below NULL. This happens when we do + * split_huge_page_pmd + */ + if (!hpte_slot_array) + return; + + /* get the base page size */ + psize = get_slice_psize(mm, s_addr); + + if (ppc_md.hugepage_invalidate) + return ppc_md.hugepage_invalidate(mm, hpte_slot_array, + s_addr, psize); + /* + * No bluk hpte removal support, invalidate each entry + */ + shift = mmu_psize_defs[psize].shift; + max_hpte_count = HPAGE_PMD_SIZE >> shift; + for (i = 0; i < max_hpte_count; i++) { + /* + * 8 bits per each hpte entries + * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit] + */ + valid = hpte_valid(hpte_slot_array, i); + if (!valid) + continue; + hidx = hpte_hash_index(hpte_slot_array, i); + + /* get the vpn */ + addr = s_addr + (i * (1ul << shift)); + if (!is_kernel_addr(addr)) { + ssize = user_segment_size(addr); + vsid = get_vsid(mm->context.id, addr, ssize); + WARN_ON(vsid == 0); + } else { + vsid = get_kernel_vsid(addr, mmu_kernel_ssize); + ssize = mmu_kernel_ssize; + } + + vpn = hpt_vpn(addr, vsid, ssize); + hash = hpt_hash(vpn, shift, ssize); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + ppc_md.hpte_invalidate(slot, vpn, psize, + MMU_PAGE_16M, ssize, 0); + } +} + +static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) +{ + pmd_val(pmd) |= pgprot_val(pgprot); + return pmd; +} + +pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) +{ + pmd_t pmd; + /* + * For a valid pte, we would have _PAGE_PRESENT or _PAGE_FILE always + * set. We use this to check THP page at pmd level. + * leaf pte for huge page, bottom two bits != 00 + */ + pmd_val(pmd) = pfn << PTE_RPN_SHIFT; + pmd_val(pmd) |= _PAGE_THP_HUGE; + pmd = pmd_set_protbits(pmd, pgprot); + return pmd; +} + +pmd_t mk_pmd(struct page *page, pgprot_t pgprot) +{ + return pfn_pmd(page_to_pfn(page), pgprot); +} + +pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) +{ + + pmd_val(pmd) &= _HPAGE_CHG_MASK; + pmd = pmd_set_protbits(pmd, newprot); + return pmd; +} + +/* + * This is called at the end of handling a user page fault, when the + * fault has been handled by updating a HUGE PMD entry in the linux page tables. + * We use it to preload an HPTE into the hash table corresponding to + * the updated linux HUGE PMD entry. + */ +void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd) +{ + return; +} + +pmd_t pmdp_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t old_pmd; + pgtable_t pgtable; + unsigned long old; + pgtable_t *pgtable_slot; + + old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); + old_pmd = __pmd(old); + /* + * We have pmd == none and we are holding page_table_lock. + * So we can safely go and clear the pgtable hash + * index info. + */ + pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; + pgtable = *pgtable_slot; + /* + * Let's zero out old valid and hash index details + * hash fault look at them. + */ + memset(pgtable, 0, PTE_FRAG_SIZE); + return old_pmd; +} + +int has_transparent_hugepage(void) +{ + if (!mmu_has_feature(MMU_FTR_16M_PAGE)) + return 0; + /* + * We support THP only if PMD_SIZE is 16MB. + */ + if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT) + return 0; + /* + * We need to make sure that we support 16MB hugepage in a segement + * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE + * of 64K. + */ + /* + * If we have 64K HPTE, we will be using that by default + */ + if (mmu_psize_defs[MMU_PAGE_64K].shift && + (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1)) + return 0; + /* + * Ok we only have 4K HPTE + */ + if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1) + return 0; + + return 1; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c index 5c45d474cfc..11571e11883 100644 --- a/arch/powerpc/mm/ppc_mmu_32.c +++ b/arch/powerpc/mm/ppc_mmu_32.c @@ -26,11 +26,11 @@ #include <linux/mm.h> #include <linux/init.h> #include <linux/highmem.h> +#include <linux/memblock.h> #include <asm/prom.h> #include <asm/mmu.h> #include <asm/machdep.h> -#include <asm/lmb.h> #include "mmu_decl.h" @@ -38,21 +38,18 @@ struct hash_pte *Hash, *Hash_end; unsigned long Hash_size, Hash_mask; unsigned long _SDR1; -union ubat { /* BAT register values to be loaded */ - struct ppc_bat bat; - u32 word[2]; -} BATS[8][2]; /* 8 pairs of IBAT, DBAT */ +struct ppc_bat BATS[8][2]; /* 8 pairs of IBAT, DBAT */ struct batrange { /* stores address ranges mapped by BATs */ unsigned long start; unsigned long limit; - unsigned long phys; + phys_addr_t phys; } bat_addrs[8]; /* * Return PA for this VA if it is mapped by a BAT, or 0 */ -unsigned long v_mapped_by_bats(unsigned long va) +phys_addr_t v_mapped_by_bats(unsigned long va) { int b; for (b = 0; b < 4; ++b) @@ -64,7 +61,7 @@ unsigned long v_mapped_by_bats(unsigned long va) /* * Return VA for a given PA or 0 if not mapped */ -unsigned long p_mapped_by_bats(unsigned long pa) +unsigned long p_mapped_by_bats(phys_addr_t pa) { int b; for (b = 0; b < 4; ++b) @@ -75,14 +72,10 @@ unsigned long p_mapped_by_bats(unsigned long pa) return 0; } -unsigned long __init mmu_mapin_ram(void) +unsigned long __init mmu_mapin_ram(unsigned long top) { -#ifdef CONFIG_POWER4 - return 0; -#else unsigned long tot, bl, done; unsigned long max_size = (256<<20); - unsigned long align; if (__map_without_bats) { printk(KERN_DEBUG "RAM mapped without BATs\n"); @@ -93,32 +86,25 @@ unsigned long __init mmu_mapin_ram(void) /* Make sure we don't map a block larger than the smallest alignment of the physical address. */ - /* alignment of PPC_MEMSTART */ - align = ~(PPC_MEMSTART-1) & PPC_MEMSTART; - /* set BAT block size to MIN(max_size, align) */ - if (align && align < max_size) - max_size = align; - - tot = total_lowmem; + tot = top; for (bl = 128<<10; bl < max_size; bl <<= 1) { if (bl * 2 > tot) break; } - setbat(2, KERNELBASE, PPC_MEMSTART, bl, _PAGE_RAM); - done = (unsigned long)bat_addrs[2].limit - KERNELBASE + 1; + setbat(2, PAGE_OFFSET, 0, bl, PAGE_KERNEL_X); + done = (unsigned long)bat_addrs[2].limit - PAGE_OFFSET + 1; if ((done < tot) && !bat_addrs[3].limit) { /* use BAT3 to cover a bit more */ tot -= done; for (bl = 128<<10; bl < max_size; bl <<= 1) if (bl * 2 > tot) break; - setbat(3, KERNELBASE+done, PPC_MEMSTART+done, bl, _PAGE_RAM); - done = (unsigned long)bat_addrs[3].limit - KERNELBASE + 1; + setbat(3, PAGE_OFFSET+done, done, bl, PAGE_KERNEL_X); + done = (unsigned long)bat_addrs[3].limit - PAGE_OFFSET + 1; } return done; -#endif } /* @@ -126,16 +112,16 @@ unsigned long __init mmu_mapin_ram(void) * The parameters are not checked; in particular size must be a power * of 2 between 128k and 256M. */ -void __init setbat(int index, unsigned long virt, unsigned long phys, +void __init setbat(int index, unsigned long virt, phys_addr_t phys, unsigned int size, int flags) { unsigned int bl; int wimgxpp; - union ubat *bat = BATS[index]; + struct ppc_bat *bat = BATS[index]; - if (((flags & _PAGE_NO_CACHE) == 0) && - cpu_has_feature(CPU_FTR_NEED_COHERENT)) - flags |= _PAGE_COHERENT; + if ((flags & _PAGE_NO_CACHE) || + (cpu_has_feature(CPU_FTR_NEED_COHERENT) == 0)) + flags &= ~_PAGE_COHERENT; bl = (size >> 17) - 1; if (PVR_VER(mfspr(SPRN_PVR)) != 1) { @@ -144,15 +130,13 @@ void __init setbat(int index, unsigned long virt, unsigned long phys, wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE | _PAGE_COHERENT | _PAGE_GUARDED); wimgxpp |= (flags & _PAGE_RW)? BPP_RW: BPP_RX; - bat[1].word[0] = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */ - bat[1].word[1] = phys | wimgxpp; -#ifndef CONFIG_KGDB /* want user access for breakpoints */ + bat[1].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */ + bat[1].batl = BAT_PHYS_ADDR(phys) | wimgxpp; if (flags & _PAGE_USER) -#endif - bat[1].bat.batu.vp = 1; + bat[1].batu |= 1; /* Vp = 1 */ if (flags & _PAGE_GUARDED) { /* G bit must be zero in IBATs */ - bat[0].word[0] = bat[0].word[1] = 0; + bat[0].batu = bat[0].batl = 0; } else { /* make IBAT same as DBAT */ bat[0] = bat[1]; @@ -165,8 +149,8 @@ void __init setbat(int index, unsigned long virt, unsigned long phys, | _PAGE_COHERENT); wimgxpp |= (flags & _PAGE_RW)? ((flags & _PAGE_USER)? PP_RWRW: PP_RWXX): PP_RXRX; - bat->word[0] = virt | wimgxpp | 4; /* Ks=0, Ku=1 */ - bat->word[1] = phys | bl | 0x40; /* V=1 */ + bat->batu = virt | wimgxpp | 4; /* Ks=0, Ku=1 */ + bat->batl = phys | bl | 0x40; /* V=1 */ } bat_addrs[index].start = virt; @@ -202,7 +186,7 @@ void __init MMU_init_hw(void) extern unsigned int hash_page[]; extern unsigned int flush_hash_patch_A[], flush_hash_patch_B[]; - if (!cpu_has_feature(CPU_FTR_HPTE_TABLE)) { + if (!mmu_has_feature(MMU_FTR_HPTE_TABLE)) { /* * Put a blr (procedure return) instruction at the * start of hash_page, since we can still get DSI @@ -239,15 +223,14 @@ void __init MMU_init_hw(void) * Find some memory for the hash table. */ if ( ppc_md.progress ) ppc_md.progress("hash:find piece", 0x322); - Hash = __va(lmb_alloc_base(Hash_size, Hash_size, - __initial_memory_limit)); + Hash = __va(memblock_alloc(Hash_size, Hash_size)); cacheable_memzero(Hash, Hash_size); _SDR1 = __pa(Hash) | SDR1_LOW_BITS; Hash_end = (struct hash_pte *) ((unsigned long)Hash + Hash_size); - printk("Total memory = %ldMB; using %ldkB for hash table (at %p)\n", - total_memory >> 20, Hash_size >> 10, Hash); + printk("Total memory = %lldMB; using %ldkB for hash table (at %p)\n", + (unsigned long long)(total_memory >> 20), Hash_size >> 10, Hash); /* @@ -288,3 +271,18 @@ void __init MMU_init_hw(void) if ( ppc_md.progress ) ppc_md.progress("hash:done", 0x205); } + +void setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + /* We don't currently support the first MEMBLOCK not mapping 0 + * physical on those processors + */ + BUG_ON(first_memblock_base != 0); + + /* 601 can only access 16MB at the moment */ + if (PVR_VER(mfspr(SPRN_PVR)) == 1) + memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01000000)); + else /* Anything else has 256M mapped */ + memblock_set_current_limit(min_t(u64, first_memblock_size, 0x10000000)); +} diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 906daeda59a..0399a670295 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -2,7 +2,7 @@ * PowerPC64 SLB support. * * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM - * Based on earlier code writteh by: + * Based on earlier code written by: * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com * Copyright (c) 2001 Dave Engebretsen * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM @@ -14,8 +14,6 @@ * 2 of the License, or (at your option) any later version. */ -#undef DEBUG - #include <asm/pgtable.h> #include <asm/mmu.h> #include <asm/mmu_context.h> @@ -23,15 +21,10 @@ #include <asm/cputable.h> #include <asm/cacheflush.h> #include <asm/smp.h> -#include <asm/firmware.h> #include <linux/compiler.h> #include <asm/udbg.h> +#include <asm/code-patching.h> -#ifdef DEBUG -#define DBG(fmt...) udbg_printf(fmt) -#else -#define DBG(fmt...) -#endif extern void slb_allocate_realmode(unsigned long ea); extern void slb_allocate_user(unsigned long ea); @@ -44,13 +37,13 @@ static void slb_allocate(unsigned long ea) slb_allocate_realmode(ea); } +#define slb_esid_mask(ssize) \ + (((ssize) == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T) + static inline unsigned long mk_esid_data(unsigned long ea, int ssize, unsigned long slot) { - unsigned long mask; - - mask = (ssize == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T; - return (ea & mask) | SLB_ESID_V | slot; + return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | slot; } #define slb_vsid_shift(ssize) \ @@ -73,8 +66,10 @@ static inline void slb_shadow_update(unsigned long ea, int ssize, * we only update the current CPU's SLB shadow buffer. */ get_slb_shadow()->save_area[entry].esid = 0; - get_slb_shadow()->save_area[entry].vsid = mk_vsid_data(ea, ssize, flags); - get_slb_shadow()->save_area[entry].esid = mk_esid_data(ea, ssize, entry); + get_slb_shadow()->save_area[entry].vsid = + cpu_to_be64(mk_vsid_data(ea, ssize, flags)); + get_slb_shadow()->save_area[entry].esid = + cpu_to_be64(mk_esid_data(ea, ssize, entry)); } static inline void slb_shadow_clear(unsigned long entry) @@ -99,15 +94,13 @@ static inline void create_shadowed_slbe(unsigned long ea, int ssize, : "memory" ); } -void slb_flush_and_rebolt(void) +static void __slb_flush_and_rebolt(void) { /* If you change this make sure you change SLB_NUM_BOLTED - * appropriately too. */ + * and PR KVM appropriately too. */ unsigned long linear_llp, vmalloc_llp, lflags, vflags; unsigned long ksp_esid_data, ksp_vsid_data; - WARN_ON(!irqs_disabled()); - linear_llp = mmu_psize_defs[mmu_linear_psize].sllp; vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp; lflags = SLB_VSID_KERNEL | linear_llp; @@ -121,15 +114,10 @@ void slb_flush_and_rebolt(void) } else { /* Update stack entry; others don't change */ slb_shadow_update(get_paca()->kstack, mmu_kernel_ssize, lflags, 2); - ksp_vsid_data = get_slb_shadow()->save_area[2].vsid; + ksp_vsid_data = + be64_to_cpu(get_slb_shadow()->save_area[2].vsid); } - /* - * We can't take a PMU exception in the following code, so hard - * disable interrupts. - */ - hard_irq_disable(); - /* We need to do this all in asm, so we're sure we don't touch * the stack between the slbia and rebolting it. */ asm volatile("isync\n" @@ -146,6 +134,21 @@ void slb_flush_and_rebolt(void) : "memory"); } +void slb_flush_and_rebolt(void) +{ + + WARN_ON(!irqs_disabled()); + + /* + * We can't take a PMU exception in the following code, so hard + * disable interrupts. + */ + hard_irq_disable(); + + __slb_flush_and_rebolt(); + get_paca()->slb_cache_ptr = 0; +} + void slb_vmalloc_update(void) { unsigned long vflags; @@ -166,7 +169,7 @@ static inline int esids_match(unsigned long addr1, unsigned long addr2) int esid_1t_count; /* System is not 1T segment size capable. */ - if (!cpu_has_feature(CPU_FTR_1T_SEGMENT)) + if (!mmu_has_feature(MMU_FTR_1T_SEGMENT)) return (GET_ESID(addr1) == GET_ESID(addr2)); esid_1t_count = (((addr1 >> SID_SHIFT_1T) != 0) + @@ -187,13 +190,21 @@ static inline int esids_match(unsigned long addr1, unsigned long addr2) /* Flush all user entries from the segment table of the current processor. */ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) { - unsigned long offset = get_paca()->slb_cache_ptr; + unsigned long offset; unsigned long slbie_data = 0; unsigned long pc = KSTK_EIP(tsk); unsigned long stack = KSTK_ESP(tsk); - unsigned long unmapped_base; + unsigned long exec_base; - if (!cpu_has_feature(CPU_FTR_NO_SLBIE_B) && + /* + * We need interrupts hard-disabled here, not just soft-disabled, + * so that a PMU interrupt can't occur, which might try to access + * user memory (to get a stack trace) and possible cause an SLB miss + * which would update the slb_cache/slb_cache_ptr fields in the PACA. + */ + hard_irq_disable(); + offset = get_paca()->slb_cache_ptr; + if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) && offset <= SLB_CACHE_ENTRIES) { int i; asm volatile("isync" : : : "memory"); @@ -207,7 +218,7 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) } asm volatile("isync" : : : "memory"); } else { - slb_flush_and_rebolt(); + __slb_flush_and_rebolt(); } /* Workaround POWER5 < DD2.1 issue */ @@ -219,40 +230,45 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) /* * preload some userspace segments into the SLB. + * Almost all 32 and 64bit PowerPC executables are linked at + * 0x10000000 so it makes sense to preload this segment. */ - if (test_tsk_thread_flag(tsk, TIF_32BIT)) - unmapped_base = TASK_UNMAPPED_BASE_USER32; - else - unmapped_base = TASK_UNMAPPED_BASE_USER64; - - if (is_kernel_addr(pc)) - return; - slb_allocate(pc); + exec_base = 0x10000000; - if (esids_match(pc,stack)) + if (is_kernel_addr(pc) || is_kernel_addr(stack) || + is_kernel_addr(exec_base)) return; - if (is_kernel_addr(stack)) - return; - slb_allocate(stack); + slb_allocate(pc); - if (esids_match(pc,unmapped_base) || esids_match(stack,unmapped_base)) - return; + if (!esids_match(pc, stack)) + slb_allocate(stack); - if (is_kernel_addr(unmapped_base)) - return; - slb_allocate(unmapped_base); + if (!esids_match(pc, exec_base) && + !esids_match(stack, exec_base)) + slb_allocate(exec_base); } static inline void patch_slb_encoding(unsigned int *insn_addr, unsigned int immed) { - /* Assume the instruction had a "0" immediate value, just - * "or" in the new value - */ - *insn_addr |= immed; - flush_icache_range((unsigned long)insn_addr, 4+ - (unsigned long)insn_addr); + int insn = (*insn_addr & 0xffff0000) | immed; + patch_instruction(insn_addr, insn); +} + +extern u32 slb_compare_rr_to_size[]; +extern u32 slb_miss_kernel_load_linear[]; +extern u32 slb_miss_kernel_load_io[]; +extern u32 slb_compare_rr_to_size[]; +extern u32 slb_miss_kernel_load_vmemmap[]; + +void slb_set_size(u16 size) +{ + if (mmu_slb_size == size) + return; + + mmu_slb_size = size; + patch_slb_encoding(slb_compare_rr_to_size, mmu_slb_size); } void slb_initialize(void) @@ -260,16 +276,18 @@ void slb_initialize(void) unsigned long linear_llp, vmalloc_llp, io_llp; unsigned long lflags, vflags; static int slb_encoding_inited; - extern unsigned int *slb_miss_kernel_load_linear; - extern unsigned int *slb_miss_kernel_load_io; - extern unsigned int *slb_compare_rr_to_size; +#ifdef CONFIG_SPARSEMEM_VMEMMAP + unsigned long vmemmap_llp; +#endif /* Prepare our SLB miss handler based on our page size */ linear_llp = mmu_psize_defs[mmu_linear_psize].sllp; io_llp = mmu_psize_defs[mmu_io_psize].sllp; vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp; get_paca()->vmalloc_sllp = SLB_VSID_KERNEL | vmalloc_llp; - +#ifdef CONFIG_SPARSEMEM_VMEMMAP + vmemmap_llp = mmu_psize_defs[mmu_vmemmap_psize].sllp; +#endif if (!slb_encoding_inited) { slb_encoding_inited = 1; patch_slb_encoding(slb_miss_kernel_load_linear, @@ -279,17 +297,18 @@ void slb_initialize(void) patch_slb_encoding(slb_compare_rr_to_size, mmu_slb_size); - DBG("SLB: linear LLP = %04x\n", linear_llp); - DBG("SLB: io LLP = %04x\n", io_llp); + pr_devel("SLB: linear LLP = %04lx\n", linear_llp); + pr_devel("SLB: io LLP = %04lx\n", io_llp); + +#ifdef CONFIG_SPARSEMEM_VMEMMAP + patch_slb_encoding(slb_miss_kernel_load_vmemmap, + SLB_VSID_KERNEL | vmemmap_llp); + pr_devel("SLB: vmemmap LLP = %04lx\n", vmemmap_llp); +#endif } get_paca()->stab_rr = SLB_NUM_BOLTED; - /* On iSeries the bolted entries have already been set up by - * the hypervisor from the lparMap data in head.S */ - if (firmware_has_feature(FW_FEATURE_ISERIES)) - return; - lflags = SLB_VSID_KERNEL | linear_llp; vflags = SLB_VSID_KERNEL | vmalloc_llp; @@ -301,11 +320,16 @@ void slb_initialize(void) create_shadowed_slbe(VMALLOC_START, mmu_kernel_ssize, vflags, 1); + /* For the boot cpu, we're running on the stack in init_thread_union, + * which is in the first segment of the linear mapping, and also + * get_paca()->kstack hasn't been initialized yet. + * For secondary cpus, we need to bolt the kernel stack entry now. + */ slb_shadow_clear(2); + if (raw_smp_processor_id() != boot_cpuid && + (get_paca()->kstack & slb_esid_mask(mmu_kernel_ssize)) > PAGE_OFFSET) + create_shadowed_slbe(get_paca()->kstack, + mmu_kernel_ssize, lflags, 2); - /* We don't bolt the stack for the time being - we're in boot, - * so the stack is in the bolted segment. By the time it goes - * elsewhere, we'll call _switch() which will bolt in the new - * one. */ asm volatile("isync":::"memory"); } diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S index 657f6b37e9d..736d18b3cef 100644 --- a/arch/powerpc/mm/slb_low.S +++ b/arch/powerpc/mm/slb_low.S @@ -31,10 +31,15 @@ * No other registers are examined or changed. */ _GLOBAL(slb_allocate_realmode) - /* r3 = faulting address */ + /* + * check for bad kernel/user address + * (ea & ~REGION_MASK) >= PGTABLE_RANGE + */ + rldicr. r9,r3,4,(63 - PGTABLE_EADDR_SIZE - 4) + bne- 8f srdi r9,r3,60 /* get region */ - srdi r10,r3,28 /* get esid */ + srdi r10,r3,SID_SHIFT /* get esid */ cmpldi cr7,r9,0xc /* cmp PAGE_OFFSET for later use */ /* r3 = address, r10 = esid, cr7 = <> PAGE_OFFSET */ @@ -47,48 +52,68 @@ _GLOBAL(slb_allocate_realmode) * it to VSID 0, which is reserved as a bad VSID - one which * will never have any pages in it. */ - /* Check if hitting the linear mapping of the vmalloc/ioremap - * kernel space + /* Check if hitting the linear mapping or some other kernel space */ bne cr7,1f /* Linear mapping encoding bits, the "li" instruction below will * be patched by the kernel at boot */ -_GLOBAL(slb_miss_kernel_load_linear) +.globl slb_miss_kernel_load_linear +slb_miss_kernel_load_linear: li r11,0 + /* + * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1 + * r9 = region id. + */ + addis r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha + addi r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l + + BEGIN_FTR_SECTION b slb_finish_load -END_FTR_SECTION_IFCLR(CPU_FTR_1T_SEGMENT) +END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT) b slb_finish_load_1T -1: /* vmalloc/ioremap mapping encoding bits, the "li" instructions below - * will be patched by the kernel at boot +1: +#ifdef CONFIG_SPARSEMEM_VMEMMAP + /* Check virtual memmap region. To be patches at kernel boot */ + cmpldi cr0,r9,0xf + bne 1f +.globl slb_miss_kernel_load_vmemmap +slb_miss_kernel_load_vmemmap: + li r11,0 + b 6f +1: +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + + /* vmalloc mapping gets the encoding from the PACA as the mapping + * can be demoted from 64K -> 4K dynamically on some machines */ -BEGIN_FTR_SECTION - /* check whether this is in vmalloc or ioremap space */ clrldi r11,r10,48 cmpldi r11,(VMALLOC_SIZE >> 28) - 1 bgt 5f lhz r11,PACAVMALLOCSLLP(r13) b 6f 5: -END_FTR_SECTION_IFCLR(CPU_FTR_CI_LARGE_PAGE) -_GLOBAL(slb_miss_kernel_load_io) + /* IO mapping */ +.globl slb_miss_kernel_load_io +slb_miss_kernel_load_io: li r11,0 6: + /* + * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1 + * r9 = region id. + */ + addis r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha + addi r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l + BEGIN_FTR_SECTION b slb_finish_load -END_FTR_SECTION_IFCLR(CPU_FTR_1T_SEGMENT) +END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT) b slb_finish_load_1T -0: /* user address: proto-VSID = context << 15 | ESID. First check - * if the address is within the boundaries of the user region - */ - srdi. r9,r10,USER_ESID_BITS - bne- 8f /* invalid ea bits set */ - - +0: /* when using slices, we extract the psize off the slice bitmaps * and then we need to get the sllp encoding off the mmu_psize_defs * array. @@ -100,17 +125,31 @@ END_FTR_SECTION_IFCLR(CPU_FTR_1T_SEGMENT) * between 4k and 64k standard page size */ #ifdef CONFIG_PPC_MM_SLICES + /* r10 have esid */ cmpldi r10,16 - - /* Get the slice index * 4 in r11 and matching slice size mask in r9 */ - ld r9,PACALOWSLICESPSIZE(r13) - sldi r11,r10,2 + /* below SLICE_LOW_TOP */ blt 5f - ld r9,PACAHIGHSLICEPSIZE(r13) - srdi r11,r10,(SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT - 2) - andi. r11,r11,0x3c + /* + * Handle hpsizes, + * r9 is get_paca()->context.high_slices_psize[index], r11 is mask_index + */ + srdi r11,r10,(SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT + 1) /* index */ + addi r9,r11,PACAHIGHSLICEPSIZE + lbzx r9,r13,r9 /* r9 is hpsizes[r11] */ + /* r11 = (r10 >> (SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT)) & 0x1 */ + rldicl r11,r10,(64 - (SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT)),63 + b 6f -5: /* Extract the psize and multiply to get an array offset */ +5: + /* + * Handle lpsizes + * r9 is get_paca()->context.low_slices_psize, r11 is index + */ + ld r9,PACALOWSLICESPSIZE(r13) + mr r11,r10 +6: + sldi r11,r11,2 /* index * 4 */ + /* Extract the psize and multiply to get an array offset */ srd r9,r9,r11 andi. r9,r9,0xf mulli r9,r9,MMUPSIZEDEFSIZE @@ -130,15 +169,13 @@ END_FTR_SECTION_IFCLR(CPU_FTR_1T_SEGMENT) ld r9,PACACONTEXTID(r13) BEGIN_FTR_SECTION cmpldi r10,0x1000 -END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT) - rldimi r10,r9,USER_ESID_BITS,0 -BEGIN_FTR_SECTION bge slb_finish_load_1T -END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT) +END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) b slb_finish_load 8: /* invalid EA */ li r10,0 /* BAD_VSID */ + li r9,0 /* BAD_VSID */ li r11,SLB_VSID_USER /* flags don't much matter */ b slb_finish_load @@ -187,8 +224,6 @@ _GLOBAL(slb_allocate_user) /* get context to calculate proto-VSID */ ld r9,PACACONTEXTID(r13) - rldimi r10,r9,USER_ESID_BITS,0 - /* fall through slb_finish_load */ #endif /* __DISABLED__ */ @@ -197,11 +232,16 @@ _GLOBAL(slb_allocate_user) /* * Finish loading of an SLB entry and return * - * r3 = EA, r10 = proto-VSID, r11 = flags, clobbers r9, cr7 = <> PAGE_OFFSET + * r3 = EA, r9 = context, r10 = ESID, r11 = flags, clobbers r9, cr7 = <> PAGE_OFFSET */ slb_finish_load: + rldimi r10,r9,ESID_BITS,0 ASM_VSID_SCRAMBLE(r10,r9,256M) - rldimi r11,r10,SLB_VSID_SHIFT,16 /* combine VSID and flags */ + /* + * bits above VSID_BITS_256M need to be ignored from r10 + * also combine VSID and flags + */ + rldimi r11,r10,SLB_VSID_SHIFT,(64 - (SLB_VSID_SHIFT + VSID_BITS_256M)) /* r3 = EA, r11 = VSID data */ /* @@ -209,26 +249,12 @@ slb_finish_load: * free slot first but that took too long. Unfortunately we * dont have any LRU information to help us choose a slot. */ -#ifdef CONFIG_PPC_ISERIES -BEGIN_FW_FTR_SECTION - /* - * On iSeries, the "bolted" stack segment can be cast out on - * shared processor switch so we need to check for a miss on - * it and restore it to the right slot. - */ - ld r9,PACAKSAVE(r13) - clrrdi r9,r9,28 - clrrdi r3,r3,28 - li r10,SLB_NUM_BOLTED-1 /* Stack goes in last bolted slot */ - cmpld r9,r3 - beq 3f -END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES) -#endif /* CONFIG_PPC_ISERIES */ 7: ld r10,PACASTABRR(r13) addi r10,r10,1 /* This gets soft patched on boot. */ -_GLOBAL(slb_compare_rr_to_size) +.globl slb_compare_rr_to_size +slb_compare_rr_to_size: cmpldi r10,0 blt+ 4f @@ -259,10 +285,10 @@ _GLOBAL(slb_compare_rr_to_size) bge 1f /* still room in the slb cache */ - sldi r11,r3,1 /* r11 = offset * sizeof(u16) */ - rldicl r10,r10,36,28 /* get low 16 bits of the ESID */ - add r11,r11,r13 /* r11 = (u16 *)paca + offset */ - sth r10,PACASLBCACHE(r11) /* paca->slb_cache[offset] = esid */ + sldi r11,r3,2 /* r11 = offset * sizeof(u32) */ + srdi r10,r10,28 /* get the 36 bits of the ESID */ + add r11,r11,r13 /* r11 = (u32 *)paca + offset */ + stw r10,PACASLBCACHE(r11) /* paca->slb_cache[offset] = esid */ addi r3,r3,1 /* offset++ */ b 2f 1: /* offset >= SLB_CACHE_ENTRIES */ @@ -274,14 +300,18 @@ _GLOBAL(slb_compare_rr_to_size) /* * Finish loading of a 1T SLB entry (for the kernel linear mapping) and return. - * We assume legacy iSeries will never have 1T segments. * - * r3 = EA, r10 = proto-VSID, r11 = flags, clobbers r9 + * r3 = EA, r9 = context, r10 = ESID(256MB), r11 = flags, clobbers r9 */ slb_finish_load_1T: - srdi r10,r10,40-28 /* get 1T ESID */ + srdi r10,r10,(SID_SHIFT_1T - SID_SHIFT) /* get 1T ESID */ + rldimi r10,r9,ESID_BITS_1T,0 ASM_VSID_SCRAMBLE(r10,r9,1T) - rldimi r11,r10,SLB_VSID_SHIFT_1T,16 /* combine VSID and flags */ + /* + * bits above VSID_BITS_1T need to be ignored from r10 + * also combine VSID and flags + */ + rldimi r11,r10,SLB_VSID_SHIFT_1T,(64 - (SLB_VSID_SHIFT_1T + VSID_BITS_1T)) li r10,MMU_SEGSIZE_1T rldimi r11,r10,SLB_VSID_SSIZE_SHIFT,0 /* insert segment size */ diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index ad928edafb0..b0c75cc15ef 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -29,11 +29,16 @@ #include <linux/pagemap.h> #include <linux/err.h> #include <linux/spinlock.h> -#include <linux/module.h> +#include <linux/export.h> #include <asm/mman.h> #include <asm/mmu.h> #include <asm/spu.h> +/* some sanity checks */ +#if (PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE +#error PGTABLE_RANGE exceeds slice_mask high_slices size +#endif + static DEFINE_SPINLOCK(slice_convert_lock); @@ -42,7 +47,7 @@ int _slice_debug = 1; static void slice_print_mask(const char *label, struct slice_mask mask) { - char *p, buf[16 + 3 + 16 + 1]; + char *p, buf[16 + 3 + 64 + 1]; int i; if (!_slice_debug) @@ -54,7 +59,7 @@ static void slice_print_mask(const char *label, struct slice_mask mask) *(p++) = '-'; *(p++) = ' '; for (i = 0; i < SLICE_NUM_HIGH; i++) - *(p++) = (mask.high_slices & (1 << i)) ? '1' : '0'; + *(p++) = (mask.high_slices & (1ul << i)) ? '1' : '0'; *(p++) = 0; printk(KERN_DEBUG "%s:%s\n", label, buf); @@ -84,8 +89,8 @@ static struct slice_mask slice_range_to_mask(unsigned long start, } if ((start + len) > SLICE_LOW_TOP) - ret.high_slices = (1u << (GET_HIGH_SLICE_INDEX(end) + 1)) - - (1u << GET_HIGH_SLICE_INDEX(start)); + ret.high_slices = (1ul << (GET_HIGH_SLICE_INDEX(end) + 1)) + - (1ul << GET_HIGH_SLICE_INDEX(start)); return ret; } @@ -135,26 +140,31 @@ static struct slice_mask slice_mask_for_free(struct mm_struct *mm) for (i = 0; i < SLICE_NUM_HIGH; i++) if (!slice_high_has_vma(mm, i)) - ret.high_slices |= 1u << i; + ret.high_slices |= 1ul << i; return ret; } static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize) { + unsigned char *hpsizes; + int index, mask_index; struct slice_mask ret = { 0, 0 }; unsigned long i; - u64 psizes; + u64 lpsizes; - psizes = mm->context.low_slices_psize; + lpsizes = mm->context.low_slices_psize; for (i = 0; i < SLICE_NUM_LOW; i++) - if (((psizes >> (i * 4)) & 0xf) == psize) + if (((lpsizes >> (i * 4)) & 0xf) == psize) ret.low_slices |= 1u << i; - psizes = mm->context.high_slices_psize; - for (i = 0; i < SLICE_NUM_HIGH; i++) - if (((psizes >> (i * 4)) & 0xf) == psize) - ret.high_slices |= 1u << i; + hpsizes = mm->context.high_slices_psize; + for (i = 0; i < SLICE_NUM_HIGH; i++) { + mask_index = i & 0x1; + index = i >> 1; + if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == psize) + ret.high_slices |= 1ul << i; + } return ret; } @@ -183,8 +193,10 @@ static void slice_flush_segments(void *parm) static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psize) { + int index, mask_index; /* Write the new slice psize bits */ - u64 lpsizes, hpsizes; + unsigned char *hpsizes; + u64 lpsizes; unsigned long i, flags; slice_dbg("slice_convert(mm=%p, psize=%d)\n", mm, psize); @@ -201,157 +213,136 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz lpsizes = (lpsizes & ~(0xful << (i * 4))) | (((unsigned long)psize) << (i * 4)); - hpsizes = mm->context.high_slices_psize; - for (i = 0; i < SLICE_NUM_HIGH; i++) - if (mask.high_slices & (1u << i)) - hpsizes = (hpsizes & ~(0xful << (i * 4))) | - (((unsigned long)psize) << (i * 4)); - + /* Assign the value back */ mm->context.low_slices_psize = lpsizes; - mm->context.high_slices_psize = hpsizes; + + hpsizes = mm->context.high_slices_psize; + for (i = 0; i < SLICE_NUM_HIGH; i++) { + mask_index = i & 0x1; + index = i >> 1; + if (mask.high_slices & (1ul << i)) + hpsizes[index] = (hpsizes[index] & + ~(0xf << (mask_index * 4))) | + (((unsigned long)psize) << (mask_index * 4)); + } slice_dbg(" lsps=%lx, hsps=%lx\n", mm->context.low_slices_psize, mm->context.high_slices_psize); spin_unlock_irqrestore(&slice_convert_lock, flags); - mb(); - /* XXX this is sub-optimal but will do for now */ - on_each_cpu(slice_flush_segments, mm, 0, 1); #ifdef CONFIG_SPU_BASE spu_flush_all_slbs(mm); #endif } +/* + * Compute which slice addr is part of; + * set *boundary_addr to the start or end boundary of that slice + * (depending on 'end' parameter); + * return boolean indicating if the slice is marked as available in the + * 'available' slice_mark. + */ +static bool slice_scan_available(unsigned long addr, + struct slice_mask available, + int end, + unsigned long *boundary_addr) +{ + unsigned long slice; + if (addr < SLICE_LOW_TOP) { + slice = GET_LOW_SLICE_INDEX(addr); + *boundary_addr = (slice + end) << SLICE_LOW_SHIFT; + return !!(available.low_slices & (1u << slice)); + } else { + slice = GET_HIGH_SLICE_INDEX(addr); + *boundary_addr = (slice + end) ? + ((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP; + return !!(available.high_slices & (1ul << slice)); + } +} + static unsigned long slice_find_area_bottomup(struct mm_struct *mm, unsigned long len, struct slice_mask available, - int psize, int use_cache) + int psize) { - struct vm_area_struct *vma; - unsigned long start_addr, addr; - struct slice_mask mask; int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); - - if (use_cache) { - if (len <= mm->cached_hole_size) { - start_addr = addr = TASK_UNMAPPED_BASE; - mm->cached_hole_size = 0; - } else - start_addr = addr = mm->free_area_cache; - } else - start_addr = addr = TASK_UNMAPPED_BASE; - -full_search: - for (;;) { - addr = _ALIGN_UP(addr, 1ul << pshift); - if ((TASK_SIZE - len) < addr) - break; - vma = find_vma(mm, addr); - BUG_ON(vma && (addr >= vma->vm_end)); - - mask = slice_range_to_mask(addr, len); - if (!slice_check_fit(mask, available)) { - if (addr < SLICE_LOW_TOP) - addr = _ALIGN_UP(addr + 1, 1ul << SLICE_LOW_SHIFT); - else - addr = _ALIGN_UP(addr + 1, 1ul << SLICE_HIGH_SHIFT); + unsigned long addr, found, next_end; + struct vm_unmapped_area_info info; + + info.flags = 0; + info.length = len; + info.align_mask = PAGE_MASK & ((1ul << pshift) - 1); + info.align_offset = 0; + + addr = TASK_UNMAPPED_BASE; + while (addr < TASK_SIZE) { + info.low_limit = addr; + if (!slice_scan_available(addr, available, 1, &addr)) continue; + + next_slice: + /* + * At this point [info.low_limit; addr) covers + * available slices only and ends at a slice boundary. + * Check if we need to reduce the range, or if we can + * extend it to cover the next available slice. + */ + if (addr >= TASK_SIZE) + addr = TASK_SIZE; + else if (slice_scan_available(addr, available, 1, &next_end)) { + addr = next_end; + goto next_slice; } - if (!vma || addr + len <= vma->vm_start) { - /* - * Remember the place where we stopped the search: - */ - if (use_cache) - mm->free_area_cache = addr + len; - return addr; - } - if (use_cache && (addr + mm->cached_hole_size) < vma->vm_start) - mm->cached_hole_size = vma->vm_start - addr; - addr = vma->vm_end; - } + info.high_limit = addr; - /* Make sure we didn't miss any holes */ - if (use_cache && start_addr != TASK_UNMAPPED_BASE) { - start_addr = addr = TASK_UNMAPPED_BASE; - mm->cached_hole_size = 0; - goto full_search; + found = vm_unmapped_area(&info); + if (!(found & ~PAGE_MASK)) + return found; } + return -ENOMEM; } static unsigned long slice_find_area_topdown(struct mm_struct *mm, unsigned long len, struct slice_mask available, - int psize, int use_cache) + int psize) { - struct vm_area_struct *vma; - unsigned long addr; - struct slice_mask mask; int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); + unsigned long addr, found, prev; + struct vm_unmapped_area_info info; - /* check if free_area_cache is useful for us */ - if (use_cache) { - if (len <= mm->cached_hole_size) { - mm->cached_hole_size = 0; - mm->free_area_cache = mm->mmap_base; - } - - /* either no address requested or can't fit in requested - * address hole - */ - addr = mm->free_area_cache; - - /* make sure it can fit in the remaining address space */ - if (addr > len) { - addr = _ALIGN_DOWN(addr - len, 1ul << pshift); - mask = slice_range_to_mask(addr, len); - if (slice_check_fit(mask, available) && - slice_area_is_free(mm, addr, len)) - /* remember the address as a hint for - * next time - */ - return (mm->free_area_cache = addr); - } - } + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.align_mask = PAGE_MASK & ((1ul << pshift) - 1); + info.align_offset = 0; addr = mm->mmap_base; - while (addr > len) { - /* Go down by chunk size */ - addr = _ALIGN_DOWN(addr - len, 1ul << pshift); - - /* Check for hit with different page size */ - mask = slice_range_to_mask(addr, len); - if (!slice_check_fit(mask, available)) { - if (addr < SLICE_LOW_TOP) - addr = _ALIGN_DOWN(addr, 1ul << SLICE_LOW_SHIFT); - else if (addr < (1ul << SLICE_HIGH_SHIFT)) - addr = SLICE_LOW_TOP; - else - addr = _ALIGN_DOWN(addr, 1ul << SLICE_HIGH_SHIFT); + while (addr > PAGE_SIZE) { + info.high_limit = addr; + if (!slice_scan_available(addr - 1, available, 0, &addr)) continue; - } + prev_slice: /* - * Lookup failure means no vma is above this address, - * else if new region fits below vma->vm_start, - * return with success: + * At this point [addr; info.high_limit) covers + * available slices only and starts at a slice boundary. + * Check if we need to reduce the range, or if we can + * extend it to cover the previous available slice. */ - vma = find_vma(mm, addr); - if (!vma || (addr + len) <= vma->vm_start) { - /* remember the address as a hint for next time */ - if (use_cache) - mm->free_area_cache = addr; - return addr; + if (addr < PAGE_SIZE) + addr = PAGE_SIZE; + else if (slice_scan_available(addr - 1, available, 0, &prev)) { + addr = prev; + goto prev_slice; } + info.low_limit = addr; - /* remember the largest hole we saw so far */ - if (use_cache && (addr + mm->cached_hole_size) < vma->vm_start) - mm->cached_hole_size = vma->vm_start - addr; - - /* try just below the current vma->vm_start */ - addr = vma->vm_start; + found = vm_unmapped_area(&info); + if (!(found & ~PAGE_MASK)) + return found; } /* @@ -360,48 +351,55 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm, * can happen with large stack limits and large mmap() * allocations. */ - addr = slice_find_area_bottomup(mm, len, available, psize, 0); - - /* - * Restore the topdown base: - */ - if (use_cache) { - mm->free_area_cache = mm->mmap_base; - mm->cached_hole_size = ~0UL; - } - - return addr; + return slice_find_area_bottomup(mm, len, available, psize); } static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len, struct slice_mask mask, int psize, - int topdown, int use_cache) + int topdown) { if (topdown) - return slice_find_area_topdown(mm, len, mask, psize, use_cache); + return slice_find_area_topdown(mm, len, mask, psize); else - return slice_find_area_bottomup(mm, len, mask, psize, use_cache); + return slice_find_area_bottomup(mm, len, mask, psize); } +#define or_mask(dst, src) do { \ + (dst).low_slices |= (src).low_slices; \ + (dst).high_slices |= (src).high_slices; \ +} while (0) + +#define andnot_mask(dst, src) do { \ + (dst).low_slices &= ~(src).low_slices; \ + (dst).high_slices &= ~(src).high_slices; \ +} while (0) + +#ifdef CONFIG_PPC_64K_PAGES +#define MMU_PAGE_BASE MMU_PAGE_64K +#else +#define MMU_PAGE_BASE MMU_PAGE_4K +#endif + unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, unsigned long flags, unsigned int psize, - int topdown, int use_cache) + int topdown) { - struct slice_mask mask; + struct slice_mask mask = {0, 0}; struct slice_mask good_mask; struct slice_mask potential_mask = {0,0} /* silence stupid warning */; - int pmask_set = 0; + struct slice_mask compat_mask = {0, 0}; int fixed = (flags & MAP_FIXED); int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); struct mm_struct *mm = current->mm; + unsigned long newaddr; /* Sanity checks */ BUG_ON(mm->task_size == 0); slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize); - slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d, use_cache=%d\n", - addr, len, flags, topdown, use_cache); + slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d\n", + addr, len, flags, topdown); if (len > mm->task_size) return -ENOMEM; @@ -410,27 +408,54 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, if (fixed && (addr & ((1ul << pshift) - 1))) return -EINVAL; if (fixed && addr > (mm->task_size - len)) - return -EINVAL; + return -ENOMEM; /* If hint, make sure it matches our alignment restrictions */ if (!fixed && addr) { addr = _ALIGN_UP(addr, 1ul << pshift); slice_dbg(" aligned addr=%lx\n", addr); + /* Ignore hint if it's too large or overlaps a VMA */ + if (addr > mm->task_size - len || + !slice_area_is_free(mm, addr, len)) + addr = 0; } - /* First makeup a "good" mask of slices that have the right size + /* First make up a "good" mask of slices that have the right size * already */ good_mask = slice_mask_for_size(mm, psize); slice_print_mask(" good_mask", good_mask); - /* First check hint if it's valid or if we have MAP_FIXED */ - if ((addr != 0 || fixed) && (mm->task_size - len) >= addr) { + /* + * Here "good" means slices that are already the right page size, + * "compat" means slices that have a compatible page size (i.e. + * 4k in a 64k pagesize kernel), and "free" means slices without + * any VMAs. + * + * If MAP_FIXED: + * check if fits in good | compat => OK + * check if fits in good | compat | free => convert free + * else bad + * If have hint: + * check if hint fits in good => OK + * check if hint fits in good | free => convert free + * Otherwise: + * search in good, found => OK + * search in good | free, found => convert free + * search in good | compat | free, found => convert free. + */ - /* Don't bother with hint if it overlaps a VMA */ - if (!fixed && !slice_area_is_free(mm, addr, len)) - goto search; +#ifdef CONFIG_PPC_64K_PAGES + /* If we support combo pages, we can allow 64k pages in 4k slices */ + if (psize == MMU_PAGE_64K) { + compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K); + if (fixed) + or_mask(good_mask, compat_mask); + } +#endif + /* First check hint if it's valid or if we have MAP_FIXED */ + if (addr != 0 || fixed) { /* Build a mask for the requested range */ mask = slice_range_to_mask(addr, len); slice_print_mask(" mask", mask); @@ -442,54 +467,63 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, slice_dbg(" fits good !\n"); return addr; } - - /* We don't fit in the good mask, check what other slices are - * empty and thus can be converted + } else { + /* Now let's see if we can find something in the existing + * slices for that size */ - potential_mask = slice_mask_for_free(mm); - potential_mask.low_slices |= good_mask.low_slices; - potential_mask.high_slices |= good_mask.high_slices; - pmask_set = 1; - slice_print_mask(" potential", potential_mask); - if (slice_check_fit(mask, potential_mask)) { - slice_dbg(" fits potential !\n"); - goto convert; + newaddr = slice_find_area(mm, len, good_mask, psize, topdown); + if (newaddr != -ENOMEM) { + /* Found within the good mask, we don't have to setup, + * we thus return directly + */ + slice_dbg(" found area at 0x%lx\n", newaddr); + return newaddr; } } - /* If we have MAP_FIXED and failed the above step, then error out */ + /* We don't fit in the good mask, check what other slices are + * empty and thus can be converted + */ + potential_mask = slice_mask_for_free(mm); + or_mask(potential_mask, good_mask); + slice_print_mask(" potential", potential_mask); + + if ((addr != 0 || fixed) && slice_check_fit(mask, potential_mask)) { + slice_dbg(" fits potential !\n"); + goto convert; + } + + /* If we have MAP_FIXED and failed the above steps, then error out */ if (fixed) return -EBUSY; - search: slice_dbg(" search...\n"); - /* Now let's see if we can find something in the existing slices - * for that size + /* If we had a hint that didn't work out, see if we can fit + * anywhere in the good area. */ - addr = slice_find_area(mm, len, good_mask, psize, topdown, use_cache); - if (addr != -ENOMEM) { - /* Found within the good mask, we don't have to setup, - * we thus return directly - */ - slice_dbg(" found area at 0x%lx\n", addr); - return addr; - } - - /* Won't fit, check what can be converted */ - if (!pmask_set) { - potential_mask = slice_mask_for_free(mm); - potential_mask.low_slices |= good_mask.low_slices; - potential_mask.high_slices |= good_mask.high_slices; - pmask_set = 1; - slice_print_mask(" potential", potential_mask); + if (addr) { + addr = slice_find_area(mm, len, good_mask, psize, topdown); + if (addr != -ENOMEM) { + slice_dbg(" found area at 0x%lx\n", addr); + return addr; + } } /* Now let's see if we can find something in the existing slices - * for that size + * for that size plus free slices */ - addr = slice_find_area(mm, len, potential_mask, psize, topdown, - use_cache); + addr = slice_find_area(mm, len, potential_mask, psize, topdown); + +#ifdef CONFIG_PPC_64K_PAGES + if (addr == -ENOMEM && psize == MMU_PAGE_64K) { + /* retry the search with 4k-page slices included */ + or_mask(potential_mask, compat_mask); + addr = slice_find_area(mm, len, potential_mask, psize, + topdown); + } +#endif + if (addr == -ENOMEM) return -ENOMEM; @@ -498,7 +532,13 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, slice_print_mask(" mask", mask); convert: - slice_convert(mm, mask, psize); + andnot_mask(mask, good_mask); + andnot_mask(mask, compat_mask); + if (mask.low_slices || mask.high_slices) { + slice_convert(mm, mask, psize); + if (psize > MMU_PAGE_BASE) + on_each_cpu(slice_flush_segments, mm, 1); + } return addr; } @@ -511,8 +551,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long flags) { return slice_get_unmapped_area(addr, len, flags, - current->mm->context.user_psize, - 0, 1); + current->mm->context.user_psize, 0); } unsigned long arch_get_unmapped_area_topdown(struct file *filp, @@ -522,24 +561,24 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, const unsigned long flags) { return slice_get_unmapped_area(addr0, len, flags, - current->mm->context.user_psize, - 1, 1); + current->mm->context.user_psize, 1); } unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr) { - u64 psizes; - int index; + unsigned char *hpsizes; + int index, mask_index; if (addr < SLICE_LOW_TOP) { - psizes = mm->context.low_slices_psize; + u64 lpsizes; + lpsizes = mm->context.low_slices_psize; index = GET_LOW_SLICE_INDEX(addr); - } else { - psizes = mm->context.high_slices_psize; - index = GET_HIGH_SLICE_INDEX(addr); + return (lpsizes >> (index * 4)) & 0xf; } - - return (psizes >> (index * 4)) & 0xf; + hpsizes = mm->context.high_slices_psize; + index = GET_HIGH_SLICE_INDEX(addr); + mask_index = index & 0x1; + return (hpsizes[index >> 1] >> (mask_index * 4)) & 0xf; } EXPORT_SYMBOL_GPL(get_slice_psize); @@ -559,7 +598,9 @@ EXPORT_SYMBOL_GPL(get_slice_psize); */ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize) { - unsigned long flags, lpsizes, hpsizes; + int index, mask_index; + unsigned char *hpsizes; + unsigned long flags, lpsizes; unsigned int old_psize; int i; @@ -580,15 +621,21 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize) if (((lpsizes >> (i * 4)) & 0xf) == old_psize) lpsizes = (lpsizes & ~(0xful << (i * 4))) | (((unsigned long)psize) << (i * 4)); + /* Assign the value back */ + mm->context.low_slices_psize = lpsizes; hpsizes = mm->context.high_slices_psize; - for (i = 0; i < SLICE_NUM_HIGH; i++) - if (((hpsizes >> (i * 4)) & 0xf) == old_psize) - hpsizes = (hpsizes & ~(0xful << (i * 4))) | - (((unsigned long)psize) << (i * 4)); + for (i = 0; i < SLICE_NUM_HIGH; i++) { + mask_index = i & 0x1; + index = i >> 1; + if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == old_psize) + hpsizes[index] = (hpsizes[index] & + ~(0xf << (mask_index * 4))) | + (((unsigned long)psize) << (mask_index * 4)); + } + + - mm->context.low_slices_psize = lpsizes; - mm->context.high_slices_psize = hpsizes; slice_dbg(" lsps=%lx, hsps=%lx\n", mm->context.low_slices_psize, @@ -598,8 +645,47 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize) spin_unlock_irqrestore(&slice_convert_lock, flags); } +void slice_set_psize(struct mm_struct *mm, unsigned long address, + unsigned int psize) +{ + unsigned char *hpsizes; + unsigned long i, flags; + u64 *lpsizes; + + spin_lock_irqsave(&slice_convert_lock, flags); + if (address < SLICE_LOW_TOP) { + i = GET_LOW_SLICE_INDEX(address); + lpsizes = &mm->context.low_slices_psize; + *lpsizes = (*lpsizes & ~(0xful << (i * 4))) | + ((unsigned long) psize << (i * 4)); + } else { + int index, mask_index; + i = GET_HIGH_SLICE_INDEX(address); + hpsizes = mm->context.high_slices_psize; + mask_index = i & 0x1; + index = i >> 1; + hpsizes[index] = (hpsizes[index] & + ~(0xf << (mask_index * 4))) | + (((unsigned long)psize) << (mask_index * 4)); + } + + spin_unlock_irqrestore(&slice_convert_lock, flags); + +#ifdef CONFIG_SPU_BASE + spu_flush_all_slbs(mm); +#endif +} + +void slice_set_range_psize(struct mm_struct *mm, unsigned long start, + unsigned long len, unsigned int psize) +{ + struct slice_mask mask = slice_range_to_mask(start, len); + + slice_convert(mm, mask, psize); +} + /* - * is_hugepage_only_range() is used by generic code to verify wether + * is_hugepage_only_range() is used by generic code to verify whether * a normal mmap mapping (non hugetlbfs) is valid on a given area. * * until the generic code provides a more generic hook and/or starts @@ -621,9 +707,18 @@ int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, unsigned long len) { struct slice_mask mask, available; + unsigned int psize = mm->context.user_psize; mask = slice_range_to_mask(addr, len); - available = slice_mask_for_size(mm, mm->context.user_psize); + available = slice_mask_for_size(mm, psize); +#ifdef CONFIG_PPC_64K_PAGES + /* We need to account for 4k slices too */ + if (psize == MMU_PAGE_64K) { + struct slice_mask compat_mask; + compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K); + or_mask(available, compat_mask); + } +#endif #if 0 /* too verbose */ slice_dbg("is_hugepage_only_range(mm=%p, addr=%lx, len=%lx)\n", diff --git a/arch/powerpc/mm/stab.c b/arch/powerpc/mm/stab.c index 50448d5de9d..3f8efa6f299 100644 --- a/arch/powerpc/mm/stab.c +++ b/arch/powerpc/mm/stab.c @@ -12,15 +12,14 @@ * 2 of the License, or (at your option) any later version. */ +#include <linux/memblock.h> + #include <asm/pgtable.h> #include <asm/mmu.h> #include <asm/mmu_context.h> #include <asm/paca.h> #include <asm/cputable.h> -#include <asm/lmb.h> -#include <asm/abs_addr.h> -#include <asm/firmware.h> -#include <asm/iseries/hv_call.h> +#include <asm/prom.h> struct stab_entry { unsigned long esid_data; @@ -28,8 +27,8 @@ struct stab_entry { }; #define NR_STAB_CACHE_ENTRIES 8 -DEFINE_PER_CPU(long, stab_cache_ptr); -DEFINE_PER_CPU(long, stab_cache[NR_STAB_CACHE_ENTRIES]); +static DEFINE_PER_CPU(long, stab_cache_ptr); +static DEFINE_PER_CPU(long [NR_STAB_CACHE_ENTRIES], stab_cache); /* * Create a segment table entry for the given esid/vsid pair. @@ -162,7 +161,7 @@ void switch_stab(struct task_struct *tsk, struct mm_struct *mm) { struct stab_entry *stab = (struct stab_entry *) get_paca()->stab_addr; struct stab_entry *ste; - unsigned long offset = __get_cpu_var(stab_cache_ptr); + unsigned long offset; unsigned long pc = KSTK_EIP(tsk); unsigned long stack = KSTK_ESP(tsk); unsigned long unmapped_base; @@ -170,6 +169,15 @@ void switch_stab(struct task_struct *tsk, struct mm_struct *mm) /* Force previous translations to complete. DRENG */ asm volatile("isync" : : : "memory"); + /* + * We need interrupts hard-disabled here, not just soft-disabled, + * so that a PMU interrupt can't occur, which might try to access + * user memory (to get a stack trace) and possible cause an STAB miss + * which would update the stab_cache/stab_cache_ptr per-cpu variables. + */ + hard_irq_disable(); + + offset = __get_cpu_var(stab_cache_ptr); if (offset <= NR_STAB_CACHE_ENTRIES) { int i; @@ -232,7 +240,7 @@ void __init stabs_alloc(void) { int cpu; - if (cpu_has_feature(CPU_FTR_SLB)) + if (mmu_has_feature(MMU_FTR_SLB)) return; for_each_possible_cpu(cpu) { @@ -241,16 +249,16 @@ void __init stabs_alloc(void) if (cpu == 0) continue; /* stab for CPU 0 is statically allocated */ - newstab = lmb_alloc_base(HW_PAGE_SIZE, HW_PAGE_SIZE, + newstab = memblock_alloc_base(HW_PAGE_SIZE, HW_PAGE_SIZE, 1<<SID_SHIFT); newstab = (unsigned long)__va(newstab); memset((void *)newstab, 0, HW_PAGE_SIZE); paca[cpu].stab_addr = newstab; - paca[cpu].stab_real = virt_to_abs(newstab); - printk(KERN_INFO "Segment table for CPU %d at 0x%lx " - "virtual, 0x%lx absolute\n", + paca[cpu].stab_real = __pa(newstab); + printk(KERN_INFO "Segment table for CPU %d at 0x%llx " + "virtual, 0x%llx absolute\n", cpu, paca[cpu].stab_addr, paca[cpu].stab_real); } } @@ -274,12 +282,5 @@ void stab_initialize(unsigned long stab) /* Set ASR */ stabreal = get_paca()->stab_real | 0x1ul; -#ifdef CONFIG_PPC_ISERIES - if (firmware_has_feature(FW_FEATURE_ISERIES)) { - HvCall1(HvCallBaseSetASR, stabreal); - return; - } -#endif /* CONFIG_PPC_ISERIES */ - mtspr(SPRN_ASR, stabreal); } diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c index 4cafc0c33d0..6c0b1f5f8d2 100644 --- a/arch/powerpc/mm/subpage-prot.c +++ b/arch/powerpc/mm/subpage-prot.c @@ -10,7 +10,6 @@ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/gfp.h> -#include <linux/slab.h> #include <linux/types.h> #include <linux/mm.h> #include <linux/hugetlb.h> @@ -24,9 +23,9 @@ * Also makes sure that the subpage_prot_table structure is * reinitialized for the next user. */ -void subpage_prot_free(pgd_t *pgd) +void subpage_prot_free(struct mm_struct *mm) { - struct subpage_prot_table *spt = pgd_subpage_prot(pgd); + struct subpage_prot_table *spt = &mm->context.spt; unsigned long i, j, addr; u32 **p; @@ -51,6 +50,13 @@ void subpage_prot_free(pgd_t *pgd) spt->maxaddr = 0; } +void subpage_prot_init_new_context(struct mm_struct *mm) +{ + struct subpage_prot_table *spt = &mm->context.spt; + + memset(spt, 0, sizeof(*spt)); +} + static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, int npages) { @@ -72,7 +78,7 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, pte = pte_offset_map_lock(mm, pmd, addr, &ptl); arch_enter_lazy_mmu_mode(); for (; npages > 0; --npages) { - pte_update(mm, addr, pte, 0, 0); + pte_update(mm, addr, pte, 0, 0, 0); addr += PAGE_SIZE; ++pte; } @@ -87,9 +93,10 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, static void subpage_prot_clear(unsigned long addr, unsigned long len) { struct mm_struct *mm = current->mm; - struct subpage_prot_table *spt = pgd_subpage_prot(mm->pgd); + struct subpage_prot_table *spt = &mm->context.spt; u32 **spm, *spp; - int i, nw; + unsigned long i; + size_t nw; unsigned long next, limit; down_write(&mm->mmap_sem); @@ -98,7 +105,7 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len) limit = spt->maxaddr; for (; addr < limit; addr = next) { next = pmd_addr_end(addr, limit); - if (addr < 0x100000000) { + if (addr < 0x100000000UL) { spm = spt->low_prot; } else { spm = spt->protptrs[addr >> SBP_L3_SHIFT]; @@ -123,6 +130,53 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len) up_write(&mm->mmap_sem); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->private; + split_huge_page_pmd(vma, addr, pmd); + return 0; +} + +static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, + unsigned long len) +{ + struct vm_area_struct *vma; + struct mm_walk subpage_proto_walk = { + .mm = mm, + .pmd_entry = subpage_walk_pmd_entry, + }; + + /* + * We don't try too hard, we just mark all the vma in that range + * VM_NOHUGEPAGE and split them. + */ + vma = find_vma(mm, addr); + /* + * If the range is in unmapped range, just return + */ + if (vma && ((addr + len) <= vma->vm_start)) + return; + + while (vma) { + if (vma->vm_start >= (addr + len)) + break; + vma->vm_flags |= VM_NOHUGEPAGE; + subpage_proto_walk.private = vma; + walk_page_range(vma->vm_start, vma->vm_end, + &subpage_proto_walk); + vma = vma->vm_next; + } +} +#else +static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, + unsigned long len) +{ + return; +} +#endif + /* * Copy in a subpage protection map for an address range. * The map has 2 bits per 4k subpage, so 32 bits per 64k page. @@ -136,9 +190,10 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len) long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map) { struct mm_struct *mm = current->mm; - struct subpage_prot_table *spt = pgd_subpage_prot(mm->pgd); + struct subpage_prot_table *spt = &mm->context.spt; u32 **spm, *spp; - int i, nw; + unsigned long i; + size_t nw; unsigned long next, limit; int err; @@ -160,10 +215,11 @@ long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map) return -EFAULT; down_write(&mm->mmap_sem); + subpage_mark_vma_nohuge(mm, addr, len); for (limit = addr + len; addr < limit; addr = next) { next = pmd_addr_end(addr, limit); err = -ENOMEM; - if (addr < 0x100000000) { + if (addr < 0x100000000UL) { spm = spt->low_prot; } else { spm = spt->protptrs[addr >> SBP_L3_SHIFT]; diff --git a/arch/powerpc/mm/tlb_32.c b/arch/powerpc/mm/tlb_hash32.c index eb4b512d65f..558e30cce33 100644 --- a/arch/powerpc/mm/tlb_32.c +++ b/arch/powerpc/mm/tlb_hash32.c @@ -27,6 +27,7 @@ #include <linux/init.h> #include <linux/highmem.h> #include <linux/pagemap.h> +#include <linux/export.h> #include <asm/tlbflush.h> #include <asm/tlb.h> @@ -45,6 +46,7 @@ void flush_hash_entry(struct mm_struct *mm, pte_t *ptep, unsigned long addr) flush_hash_pages(mm->context.id, addr, ptephys, 1); } } +EXPORT_SYMBOL(flush_hash_entry); /* * Called by ptep_set_access_flags, must flush on CPUs for which the @@ -85,17 +87,6 @@ void tlb_flush(struct mmu_gather *tlb) * -- Cort */ -/* - * 750 SMP is a Bad Idea because the 750 doesn't broadcast all - * the cache operations on the bus. Hence we need to use an IPI - * to get the other CPU(s) to invalidate their TLBs. - */ -#ifdef CONFIG_SMP_750 -#define FINISH_FLUSH smp_send_tlb_invalidate(0) -#else -#define FINISH_FLUSH do { } while (0) -#endif - static void flush_range(struct mm_struct *mm, unsigned long start, unsigned long end) { @@ -134,8 +125,8 @@ static void flush_range(struct mm_struct *mm, unsigned long start, void flush_tlb_kernel_range(unsigned long start, unsigned long end) { flush_range(&init_mm, start, end); - FINISH_FLUSH; } +EXPORT_SYMBOL(flush_tlb_kernel_range); /* * Flush all the (user) entries for the address space described by mm. @@ -157,8 +148,8 @@ void flush_tlb_mm(struct mm_struct *mm) */ for (mp = mm->mmap; mp != NULL; mp = mp->vm_next) flush_range(mp->vm_mm, mp->vm_start, mp->vm_end); - FINISH_FLUSH; } +EXPORT_SYMBOL(flush_tlb_mm); void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) { @@ -173,8 +164,8 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) pmd = pmd_offset(pud_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr); if (!pmd_none(*pmd)) flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1); - FINISH_FLUSH; } +EXPORT_SYMBOL(flush_tlb_page); /* * For each address in the range, find the pte for the address @@ -185,5 +176,9 @@ void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { flush_range(vma->vm_mm, start, end); - FINISH_FLUSH; +} +EXPORT_SYMBOL(flush_tlb_range); + +void __init early_init_mmu(void) +{ } diff --git a/arch/powerpc/mm/tlb_64.c b/arch/powerpc/mm/tlb_hash64.c index e2d867ce1c7..c99f6510a0b 100644 --- a/arch/powerpc/mm/tlb_64.c +++ b/arch/powerpc/mm/tlb_hash64.c @@ -23,7 +23,6 @@ #include <linux/kernel.h> #include <linux/mm.h> -#include <linux/init.h> #include <linux/percpu.h> #include <linux/hardirq.h> #include <asm/pgalloc.h> @@ -33,102 +32,18 @@ DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch); -/* This is declared as we are using the more or less generic - * include/asm-powerpc/tlb.h file -- tgall - */ -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); -DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur); -unsigned long pte_freelist_forced_free; - -struct pte_freelist_batch -{ - struct rcu_head rcu; - unsigned int index; - pgtable_free_t tables[0]; -}; - -DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur); -unsigned long pte_freelist_forced_free; - -#define PTE_FREELIST_SIZE \ - ((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \ - / sizeof(pgtable_free_t)) - -static void pte_free_smp_sync(void *arg) -{ - /* Do nothing, just ensure we sync with all CPUs */ -} - -/* This is only called when we are critically out of memory - * (and fail to get a page in pte_free_tlb). - */ -static void pgtable_free_now(pgtable_free_t pgf) -{ - pte_freelist_forced_free++; - - smp_call_function(pte_free_smp_sync, NULL, 0, 1); - - pgtable_free(pgf); -} - -static void pte_free_rcu_callback(struct rcu_head *head) -{ - struct pte_freelist_batch *batch = - container_of(head, struct pte_freelist_batch, rcu); - unsigned int i; - - for (i = 0; i < batch->index; i++) - pgtable_free(batch->tables[i]); - - free_page((unsigned long)batch); -} - -static void pte_free_submit(struct pte_freelist_batch *batch) -{ - INIT_RCU_HEAD(&batch->rcu); - call_rcu(&batch->rcu, pte_free_rcu_callback); -} - -void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf) -{ - /* This is safe since tlb_gather_mmu has disabled preemption */ - cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id()); - struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur); - - if (atomic_read(&tlb->mm->mm_users) < 2 || - cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask)) { - pgtable_free(pgf); - return; - } - - if (*batchp == NULL) { - *batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC); - if (*batchp == NULL) { - pgtable_free_now(pgf); - return; - } - (*batchp)->index = 0; - } - (*batchp)->tables[(*batchp)->index++] = pgf; - if ((*batchp)->index == PTE_FREELIST_SIZE) { - pte_free_submit(*batchp); - *batchp = NULL; - } -} - /* * A linux PTE was changed and the corresponding hash table entry * neesd to be flushed. This function will either perform the flush * immediately or will batch it up if the current CPU has an active * batch on it. - * - * Must be called from within some kind of spinlock/non-preempt region... */ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long pte, int huge) { - struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch); - unsigned long vsid, vaddr; + unsigned long vpn; + struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch); + unsigned long vsid; unsigned int psize; int ssize; real_pte_t rpte; @@ -136,11 +51,6 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, i = batch->index; - /* We mask the address for the base page size. Huge pages will - * have applied their own masking already - */ - addr &= PAGE_MASK; - /* Get page size (maybe move back to caller). * * NOTE: when using special 64K mappings in 4K environment like @@ -150,24 +60,33 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, */ if (huge) { #ifdef CONFIG_HUGETLB_PAGE - psize = mmu_huge_psize; + psize = get_slice_psize(mm, addr); + /* Mask the address for the correct page size */ + addr &= ~((1UL << mmu_psize_defs[psize].shift) - 1); #else BUG(); psize = pte_pagesize_index(mm, addr, pte); /* shutup gcc */ #endif - } else + } else { psize = pte_pagesize_index(mm, addr, pte); + /* Mask the address for the standard page size. If we + * have a 64k page kernel, but the hardware does not + * support 64k pages, this might be different from the + * hardware page size encoded in the slice table. */ + addr &= PAGE_MASK; + } + /* Build full vaddr */ if (!is_kernel_addr(addr)) { ssize = user_segment_size(addr); vsid = get_vsid(mm->context.id, addr, ssize); - WARN_ON(vsid == 0); } else { vsid = get_kernel_vsid(addr, mmu_kernel_ssize); ssize = mmu_kernel_ssize; } - vaddr = hpt_va(addr, vsid, ssize); + WARN_ON(vsid == 0); + vpn = hpt_vpn(addr, vsid, ssize); rpte = __real_pte(__pte(pte), ptep); /* @@ -177,7 +96,8 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, * and decide to use local invalidates instead... */ if (!batch->active) { - flush_hash_page(vaddr, rpte, psize, ssize, 0); + flush_hash_page(vpn, rpte, psize, ssize, 0); + put_cpu_var(ppc64_tlb_batch); return; } @@ -202,10 +122,11 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, batch->ssize = ssize; } batch->pte[i] = rpte; - batch->vaddr[i] = vaddr; + batch->vpn[i] = vpn; batch->index = ++i; if (i >= PPC64_TLB_BATCH_NR) __flush_tlb_pending(batch); + put_cpu_var(ppc64_tlb_batch); } /* @@ -217,30 +138,33 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, */ void __flush_tlb_pending(struct ppc64_tlb_batch *batch) { - cpumask_t tmp; + const struct cpumask *tmp; int i, local = 0; i = batch->index; - tmp = cpumask_of_cpu(smp_processor_id()); - if (cpus_equal(batch->mm->cpu_vm_mask, tmp)) + tmp = cpumask_of(smp_processor_id()); + if (cpumask_equal(mm_cpumask(batch->mm), tmp)) local = 1; if (i == 1) - flush_hash_page(batch->vaddr[0], batch->pte[0], + flush_hash_page(batch->vpn[0], batch->pte[0], batch->psize, batch->ssize, local); else flush_hash_range(i, local); batch->index = 0; } -void pte_free_finish(void) +void tlb_flush(struct mmu_gather *tlb) { - /* This is safe since tlb_gather_mmu has disabled preemption */ - struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur); + struct ppc64_tlb_batch *tlbbatch = &get_cpu_var(ppc64_tlb_batch); - if (*batchp == NULL) - return; - pte_free_submit(*batchp); - *batchp = NULL; + /* If there's a TLB batch pending, then we must flush it because the + * pages are going to be freed and we really don't want to have a CPU + * access a freed page because it has a stale TLB + */ + if (tlbbatch->index) + __flush_tlb_pending(tlbbatch); + + put_cpu_var(ppc64_tlb_batch); } /** @@ -258,14 +182,13 @@ void pte_free_finish(void) * since 64K pages may overlap with other bridges when using 64K pages * with 4K HW pages on IO space. * - * Because of that usage pattern, it's only available with CONFIG_HOTPLUG - * and is implemented for small size rather than speed. + * Because of that usage pattern, it is implemented for small size rather + * than speed. */ -#ifdef CONFIG_HOTPLUG - void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, unsigned long end) { + int hugepage_shift; unsigned long flags; start = _ALIGN_DOWN(start, PAGE_SIZE); @@ -283,7 +206,8 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, local_irq_save(flags); arch_enter_lazy_mmu_mode(); for (; start < end; start += PAGE_SIZE) { - pte_t *ptep = find_linux_pte(mm->pgd, start); + pte_t *ptep = find_linux_pte_or_hugepte(mm->pgd, start, + &hugepage_shift); unsigned long pte; if (ptep == NULL) @@ -291,10 +215,38 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, pte = pte_val(*ptep); if (!(pte & _PAGE_HASHPTE)) continue; - hpte_need_flush(mm, start, ptep, pte, 0); + if (unlikely(hugepage_shift && pmd_trans_huge(*(pmd_t *)pte))) + hpte_do_hugepage_flush(mm, start, (pmd_t *)pte); + else + hpte_need_flush(mm, start, ptep, pte, 0); } arch_leave_lazy_mmu_mode(); local_irq_restore(flags); } -#endif /* CONFIG_HOTPLUG */ +void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) +{ + pte_t *pte; + pte_t *start_pte; + unsigned long flags; + + addr = _ALIGN_DOWN(addr, PMD_SIZE); + /* Note: Normally, we should only ever use a batch within a + * PTE locked section. This violates the rule, but will work + * since we don't actually modify the PTEs, we just flush the + * hash while leaving the PTEs intact (including their reference + * to being hashed). This is not the most performance oriented + * way to do things but is fine for our needs here. + */ + local_irq_save(flags); + arch_enter_lazy_mmu_mode(); + start_pte = pte_offset_map(pmd, addr); + for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) { + unsigned long pteval = pte_val(*pte); + if (pteval & _PAGE_HASHPTE) + hpte_need_flush(mm, addr, pte, pteval, 0); + addr += PAGE_SIZE; + } + arch_leave_lazy_mmu_mode(); + local_irq_restore(flags); +} diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S new file mode 100644 index 00000000000..356e8b41fb0 --- /dev/null +++ b/arch/powerpc/mm/tlb_low_64e.S @@ -0,0 +1,1171 @@ +/* + * Low level TLB miss handlers for Book3E + * + * Copyright (C) 2008-2009 + * Ben. Herrenschmidt (benh@kernel.crashing.org), IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/processor.h> +#include <asm/reg.h> +#include <asm/page.h> +#include <asm/mmu.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/cputable.h> +#include <asm/pgtable.h> +#include <asm/exception-64e.h> +#include <asm/ppc-opcode.h> +#include <asm/kvm_asm.h> +#include <asm/kvm_booke_hv_asm.h> + +#ifdef CONFIG_PPC_64K_PAGES +#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE+1) +#else +#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE) +#endif +#define VPTE_PUD_SHIFT (VPTE_PMD_SHIFT + PMD_INDEX_SIZE) +#define VPTE_PGD_SHIFT (VPTE_PUD_SHIFT + PUD_INDEX_SIZE) +#define VPTE_INDEX_SIZE (VPTE_PGD_SHIFT + PGD_INDEX_SIZE) + +/********************************************************************** + * * + * TLB miss handling for Book3E with a bolted linear mapping * + * No virtual page table, no nested TLB misses * + * * + **********************************************************************/ + +/* + * Note that, unlike non-bolted handlers, TLB_EXFRAME is not + * modified by the TLB miss handlers themselves, since the TLB miss + * handler code will not itself cause a recursive TLB miss. + * + * TLB_EXFRAME will be modified when crit/mc/debug exceptions are + * entered/exited. + */ +.macro tlb_prolog_bolted intnum addr + mtspr SPRN_SPRG_GEN_SCRATCH,r12 + mfspr r12,SPRN_SPRG_TLB_EXFRAME + std r13,EX_TLB_R13(r12) + std r10,EX_TLB_R10(r12) + mfspr r13,SPRN_SPRG_PACA + + mfcr r10 + std r11,EX_TLB_R11(r12) +#ifdef CONFIG_KVM_BOOKE_HV +BEGIN_FTR_SECTION + mfspr r11, SPRN_SRR1 +END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) +#endif + DO_KVM \intnum, SPRN_SRR1 + std r16,EX_TLB_R16(r12) + mfspr r16,\addr /* get faulting address */ + std r14,EX_TLB_R14(r12) + ld r14,PACAPGD(r13) + std r15,EX_TLB_R15(r12) + std r10,EX_TLB_CR(r12) + TLB_MISS_PROLOG_STATS +.endm + +.macro tlb_epilog_bolted + ld r14,EX_TLB_CR(r12) + ld r10,EX_TLB_R10(r12) + ld r11,EX_TLB_R11(r12) + ld r13,EX_TLB_R13(r12) + mtcr r14 + ld r14,EX_TLB_R14(r12) + ld r15,EX_TLB_R15(r12) + TLB_MISS_RESTORE_STATS + ld r16,EX_TLB_R16(r12) + mfspr r12,SPRN_SPRG_GEN_SCRATCH +.endm + +/* Data TLB miss */ + START_EXCEPTION(data_tlb_miss_bolted) + tlb_prolog_bolted BOOKE_INTERRUPT_DTLB_MISS SPRN_DEAR + + /* We need _PAGE_PRESENT and _PAGE_ACCESSED set */ + + /* We do the user/kernel test for the PID here along with the RW test + */ + /* We pre-test some combination of permissions to avoid double + * faults: + * + * We move the ESR:ST bit into the position of _PAGE_BAP_SW in the PTE + * ESR_ST is 0x00800000 + * _PAGE_BAP_SW is 0x00000010 + * So the shift is >> 19. This tests for supervisor writeability. + * If the page happens to be supervisor writeable and not user + * writeable, we will take a new fault later, but that should be + * a rare enough case. + * + * We also move ESR_ST in _PAGE_DIRTY position + * _PAGE_DIRTY is 0x00001000 so the shift is >> 11 + * + * MAS1 is preset for all we need except for TID that needs to + * be cleared for kernel translations + */ + + mfspr r11,SPRN_ESR + + srdi r15,r16,60 /* get region */ + rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 + bne- dtlb_miss_fault_bolted /* Bail if fault addr is invalid */ + + rlwinm r10,r11,32-19,27,27 + rlwimi r10,r11,32-16,19,19 + cmpwi r15,0 /* user vs kernel check */ + ori r10,r10,_PAGE_PRESENT + oris r11,r10,_PAGE_ACCESSED@h + + TLB_MISS_STATS_SAVE_INFO_BOLTED + bne tlb_miss_kernel_bolted + +tlb_miss_common_bolted: +/* + * This is the guts of the TLB miss handler for bolted-linear. + * We are entered with: + * + * r16 = faulting address + * r15 = crap (free to use) + * r14 = page table base + * r13 = PACA + * r11 = PTE permission mask + * r10 = crap (free to use) + */ + rldicl r15,r16,64-PGDIR_SHIFT+3,64-PGD_INDEX_SIZE-3 + cmpldi cr0,r14,0 + clrrdi r15,r15,3 + beq tlb_miss_fault_bolted /* No PGDIR, bail */ + +BEGIN_MMU_FTR_SECTION + /* Set the TLB reservation and search for existing entry. Then load + * the entry. + */ + PPC_TLBSRX_DOT(0,R16) + ldx r14,r14,r15 /* grab pgd entry */ + beq tlb_miss_done_bolted /* tlb exists already, bail */ +MMU_FTR_SECTION_ELSE + ldx r14,r14,r15 /* grab pgd entry */ +ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV) + +#ifndef CONFIG_PPC_64K_PAGES + rldicl r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3 + clrrdi r15,r15,3 + cmpdi cr0,r14,0 + bge tlb_miss_fault_bolted /* Bad pgd entry or hugepage; bail */ + ldx r14,r14,r15 /* grab pud entry */ +#endif /* CONFIG_PPC_64K_PAGES */ + + rldicl r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3 + clrrdi r15,r15,3 + cmpdi cr0,r14,0 + bge tlb_miss_fault_bolted + ldx r14,r14,r15 /* Grab pmd entry */ + + rldicl r15,r16,64-PAGE_SHIFT+3,64-PTE_INDEX_SIZE-3 + clrrdi r15,r15,3 + cmpdi cr0,r14,0 + bge tlb_miss_fault_bolted + ldx r14,r14,r15 /* Grab PTE, normal (!huge) page */ + + /* Check if required permissions are met */ + andc. r15,r11,r14 + rldicr r15,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT + bne- tlb_miss_fault_bolted + + /* Now we build the MAS: + * + * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG + * MAS 1 : Almost fully setup + * - PID already updated by caller if necessary + * - TSIZE need change if !base page size, not + * yet implemented for now + * MAS 2 : Defaults not useful, need to be redone + * MAS 3+7 : Needs to be done + */ + clrrdi r11,r16,12 /* Clear low crap in EA */ + clrldi r15,r15,12 /* Clear crap at the top */ + rlwimi r11,r14,32-19,27,31 /* Insert WIMGE */ + rlwimi r15,r14,32-8,22,25 /* Move in U bits */ + mtspr SPRN_MAS2,r11 + andi. r11,r14,_PAGE_DIRTY + rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */ + + /* Mask out SW and UW if !DIRTY (XXX optimize this !) */ + bne 1f + li r11,MAS3_SW|MAS3_UW + andc r15,r15,r11 +1: + mtspr SPRN_MAS7_MAS3,r15 + tlbwe + +tlb_miss_done_bolted: + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK) + tlb_epilog_bolted + rfi + +itlb_miss_kernel_bolted: + li r11,_PAGE_PRESENT|_PAGE_BAP_SX /* Base perm */ + oris r11,r11,_PAGE_ACCESSED@h +tlb_miss_kernel_bolted: + mfspr r10,SPRN_MAS1 + ld r14,PACA_KERNELPGD(r13) + cmpldi cr0,r15,8 /* Check for vmalloc region */ + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 + beq+ tlb_miss_common_bolted + +tlb_miss_fault_bolted: + /* We need to check if it was an instruction miss */ + andi. r10,r11,_PAGE_EXEC|_PAGE_BAP_SX + bne itlb_miss_fault_bolted +dtlb_miss_fault_bolted: + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) + tlb_epilog_bolted + b exc_data_storage_book3e +itlb_miss_fault_bolted: + TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) + tlb_epilog_bolted + b exc_instruction_storage_book3e + +/* Instruction TLB miss */ + START_EXCEPTION(instruction_tlb_miss_bolted) + tlb_prolog_bolted BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR0 + + rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 + srdi r15,r16,60 /* get region */ + TLB_MISS_STATS_SAVE_INFO_BOLTED + bne- itlb_miss_fault_bolted + + li r11,_PAGE_PRESENT|_PAGE_EXEC /* Base perm */ + + /* We do the user/kernel test for the PID here along with the RW test + */ + + cmpldi cr0,r15,0 /* Check for user region */ + oris r11,r11,_PAGE_ACCESSED@h + beq tlb_miss_common_bolted + b itlb_miss_kernel_bolted + +#ifdef CONFIG_PPC_FSL_BOOK3E +/* + * TLB miss handling for e6500 and derivatives, using hardware tablewalk. + * + * Linear mapping is bolted: no virtual page table or nested TLB misses + * Indirect entries in TLB1, hardware loads resulting direct entries + * into TLB0 + * No HES or NV hint on TLB1, so we need to do software round-robin + * No tlbsrx. so we need a spinlock, and we have to deal + * with MAS-damage caused by tlbsx + * 4K pages only + */ + + START_EXCEPTION(instruction_tlb_miss_e6500) + tlb_prolog_bolted BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR0 + + ld r11,PACA_TCD_PTR(r13) + srdi. r15,r16,60 /* get region */ + ori r16,r16,1 + + TLB_MISS_STATS_SAVE_INFO_BOLTED + bne tlb_miss_kernel_e6500 /* user/kernel test */ + + b tlb_miss_common_e6500 + + START_EXCEPTION(data_tlb_miss_e6500) + tlb_prolog_bolted BOOKE_INTERRUPT_DTLB_MISS SPRN_DEAR + + ld r11,PACA_TCD_PTR(r13) + srdi. r15,r16,60 /* get region */ + rldicr r16,r16,0,62 + + TLB_MISS_STATS_SAVE_INFO_BOLTED + bne tlb_miss_kernel_e6500 /* user vs kernel check */ + +/* + * This is the guts of the TLB miss handler for e6500 and derivatives. + * We are entered with: + * + * r16 = page of faulting address (low bit 0 if data, 1 if instruction) + * r15 = crap (free to use) + * r14 = page table base + * r13 = PACA + * r11 = tlb_per_core ptr + * r10 = cpu number + */ +tlb_miss_common_e6500: + /* + * Search if we already have an indirect entry for that virtual + * address, and if we do, bail out. + * + * MAS6:IND should be already set based on MAS4 + */ +1: lbarx r15,0,r11 + lhz r10,PACAPACAINDEX(r13) + cmpdi r15,0 + cmpdi cr1,r15,1 /* set cr1.eq = 0 for non-recursive */ + bne 2f + stbcx. r10,0,r11 + bne 1b +3: + .subsection 1 +2: cmpd cr1,r15,r10 /* recursive lock due to mcheck/crit/etc? */ + beq cr1,3b /* unlock will happen if cr1.eq = 0 */ + lbz r15,0(r11) + cmpdi r15,0 + bne 2b + b 1b + .previous + + mfspr r15,SPRN_MAS2 + + tlbsx 0,r16 + mfspr r10,SPRN_MAS1 + andis. r10,r10,MAS1_VALID@h + bne tlb_miss_done_e6500 + + /* Undo MAS-damage from the tlbsx */ + mfspr r10,SPRN_MAS1 + oris r10,r10,MAS1_VALID@h + mtspr SPRN_MAS1,r10 + mtspr SPRN_MAS2,r15 + + /* Now, we need to walk the page tables. First check if we are in + * range. + */ + rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 + bne- tlb_miss_fault_e6500 + + rldicl r15,r16,64-PGDIR_SHIFT+3,64-PGD_INDEX_SIZE-3 + cmpldi cr0,r14,0 + clrrdi r15,r15,3 + beq- tlb_miss_fault_e6500 /* No PGDIR, bail */ + ldx r14,r14,r15 /* grab pgd entry */ + + rldicl r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3 + clrrdi r15,r15,3 + cmpdi cr0,r14,0 + bge tlb_miss_fault_e6500 /* Bad pgd entry or hugepage; bail */ + ldx r14,r14,r15 /* grab pud entry */ + + rldicl r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3 + clrrdi r15,r15,3 + cmpdi cr0,r14,0 + bge tlb_miss_fault_e6500 + ldx r14,r14,r15 /* Grab pmd entry */ + + mfspr r10,SPRN_MAS0 + cmpdi cr0,r14,0 + bge tlb_miss_fault_e6500 + + /* Now we build the MAS for a 2M indirect page: + * + * MAS 0 : ESEL needs to be filled by software round-robin + * MAS 1 : Fully set up + * - PID already updated by caller if necessary + * - TSIZE for now is base ind page size always + * - TID already cleared if necessary + * MAS 2 : Default not 2M-aligned, need to be redone + * MAS 3+7 : Needs to be done + */ + + ori r14,r14,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT) + mtspr SPRN_MAS7_MAS3,r14 + + clrrdi r15,r16,21 /* make EA 2M-aligned */ + mtspr SPRN_MAS2,r15 + + lbz r15,TCD_ESEL_NEXT(r11) + lbz r16,TCD_ESEL_MAX(r11) + lbz r14,TCD_ESEL_FIRST(r11) + rlwimi r10,r15,16,0x00ff0000 /* insert esel_next into MAS0 */ + addi r15,r15,1 /* increment esel_next */ + mtspr SPRN_MAS0,r10 + cmpw r15,r16 + iseleq r15,r14,r15 /* if next == last use first */ + stb r15,TCD_ESEL_NEXT(r11) + + tlbwe + +tlb_miss_done_e6500: + .macro tlb_unlock_e6500 + beq cr1,1f /* no unlock if lock was recursively grabbed */ + li r15,0 + isync + stb r15,0(r11) +1: + .endm + + tlb_unlock_e6500 + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK) + tlb_epilog_bolted + rfi + +tlb_miss_kernel_e6500: + mfspr r10,SPRN_MAS1 + ld r14,PACA_KERNELPGD(r13) + cmpldi cr0,r15,8 /* Check for vmalloc region */ + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 + beq+ tlb_miss_common_e6500 + +tlb_miss_fault_e6500: + tlb_unlock_e6500 + /* We need to check if it was an instruction miss */ + andi. r16,r16,1 + bne itlb_miss_fault_e6500 +dtlb_miss_fault_e6500: + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) + tlb_epilog_bolted + b exc_data_storage_book3e +itlb_miss_fault_e6500: + TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) + tlb_epilog_bolted + b exc_instruction_storage_book3e +#endif /* CONFIG_PPC_FSL_BOOK3E */ + +/********************************************************************** + * * + * TLB miss handling for Book3E with TLB reservation and HES support * + * * + **********************************************************************/ + + +/* Data TLB miss */ + START_EXCEPTION(data_tlb_miss) + TLB_MISS_PROLOG + + /* Now we handle the fault proper. We only save DEAR in normal + * fault case since that's the only interesting values here. + * We could probably also optimize by not saving SRR0/1 in the + * linear mapping case but I'll leave that for later + */ + mfspr r14,SPRN_ESR + mfspr r16,SPRN_DEAR /* get faulting address */ + srdi r15,r16,60 /* get region */ + cmpldi cr0,r15,0xc /* linear mapping ? */ + TLB_MISS_STATS_SAVE_INFO + beq tlb_load_linear /* yes -> go to linear map load */ + + /* The page tables are mapped virtually linear. At this point, though, + * we don't know whether we are trying to fault in a first level + * virtual address or a virtual page table address. We can get that + * from bit 0x1 of the region ID which we have set for a page table + */ + andi. r10,r15,0x1 + bne- virt_page_table_tlb_miss + + std r14,EX_TLB_ESR(r12); /* save ESR */ + std r16,EX_TLB_DEAR(r12); /* save DEAR */ + + /* We need _PAGE_PRESENT and _PAGE_ACCESSED set */ + li r11,_PAGE_PRESENT + oris r11,r11,_PAGE_ACCESSED@h + + /* We do the user/kernel test for the PID here along with the RW test + */ + cmpldi cr0,r15,0 /* Check for user region */ + + /* We pre-test some combination of permissions to avoid double + * faults: + * + * We move the ESR:ST bit into the position of _PAGE_BAP_SW in the PTE + * ESR_ST is 0x00800000 + * _PAGE_BAP_SW is 0x00000010 + * So the shift is >> 19. This tests for supervisor writeability. + * If the page happens to be supervisor writeable and not user + * writeable, we will take a new fault later, but that should be + * a rare enough case. + * + * We also move ESR_ST in _PAGE_DIRTY position + * _PAGE_DIRTY is 0x00001000 so the shift is >> 11 + * + * MAS1 is preset for all we need except for TID that needs to + * be cleared for kernel translations + */ + rlwimi r11,r14,32-19,27,27 + rlwimi r11,r14,32-16,19,19 + beq normal_tlb_miss + /* XXX replace the RMW cycles with immediate loads + writes */ +1: mfspr r10,SPRN_MAS1 + cmpldi cr0,r15,8 /* Check for vmalloc region */ + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 + beq+ normal_tlb_miss + + /* We got a crappy address, just fault with whatever DEAR and ESR + * are here + */ + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e + +/* Instruction TLB miss */ + START_EXCEPTION(instruction_tlb_miss) + TLB_MISS_PROLOG + + /* If we take a recursive fault, the second level handler may need + * to know whether we are handling a data or instruction fault in + * order to get to the right store fault handler. We provide that + * info by writing a crazy value in ESR in our exception frame + */ + li r14,-1 /* store to exception frame is done later */ + + /* Now we handle the fault proper. We only save DEAR in the non + * linear mapping case since we know the linear mapping case will + * not re-enter. We could indeed optimize and also not save SRR0/1 + * in the linear mapping case but I'll leave that for later + * + * Faulting address is SRR0 which is already in r16 + */ + srdi r15,r16,60 /* get region */ + cmpldi cr0,r15,0xc /* linear mapping ? */ + TLB_MISS_STATS_SAVE_INFO + beq tlb_load_linear /* yes -> go to linear map load */ + + /* We do the user/kernel test for the PID here along with the RW test + */ + li r11,_PAGE_PRESENT|_PAGE_EXEC /* Base perm */ + oris r11,r11,_PAGE_ACCESSED@h + + cmpldi cr0,r15,0 /* Check for user region */ + std r14,EX_TLB_ESR(r12) /* write crazy -1 to frame */ + beq normal_tlb_miss + + li r11,_PAGE_PRESENT|_PAGE_BAP_SX /* Base perm */ + oris r11,r11,_PAGE_ACCESSED@h + /* XXX replace the RMW cycles with immediate loads + writes */ + mfspr r10,SPRN_MAS1 + cmpldi cr0,r15,8 /* Check for vmalloc region */ + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 + beq+ normal_tlb_miss + + /* We got a crappy address, just fault */ + TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + +/* + * This is the guts of the first-level TLB miss handler for direct + * misses. We are entered with: + * + * r16 = faulting address + * r15 = region ID + * r14 = crap (free to use) + * r13 = PACA + * r12 = TLB exception frame in PACA + * r11 = PTE permission mask + * r10 = crap (free to use) + */ +normal_tlb_miss: + /* So we first construct the page table address. We do that by + * shifting the bottom of the address (not the region ID) by + * PAGE_SHIFT-3, clearing the bottom 3 bits (get a PTE ptr) and + * or'ing the fourth high bit. + * + * NOTE: For 64K pages, we do things slightly differently in + * order to handle the weird page table format used by linux + */ + ori r10,r15,0x1 +#ifdef CONFIG_PPC_64K_PAGES + /* For the top bits, 16 bytes per PTE */ + rldicl r14,r16,64-(PAGE_SHIFT-4),PAGE_SHIFT-4+4 + /* Now create the bottom bits as 0 in position 0x8000 and + * the rest calculated for 8 bytes per PTE + */ + rldicl r15,r16,64-(PAGE_SHIFT-3),64-15 + /* Insert the bottom bits in */ + rlwimi r14,r15,0,16,31 +#else + rldicl r14,r16,64-(PAGE_SHIFT-3),PAGE_SHIFT-3+4 +#endif + sldi r15,r10,60 + clrrdi r14,r14,3 + or r10,r15,r14 + +BEGIN_MMU_FTR_SECTION + /* Set the TLB reservation and search for existing entry. Then load + * the entry. + */ + PPC_TLBSRX_DOT(0,R16) + ld r14,0(r10) + beq normal_tlb_miss_done +MMU_FTR_SECTION_ELSE + ld r14,0(r10) +ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV) + +finish_normal_tlb_miss: + /* Check if required permissions are met */ + andc. r15,r11,r14 + bne- normal_tlb_miss_access_fault + + /* Now we build the MAS: + * + * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG + * MAS 1 : Almost fully setup + * - PID already updated by caller if necessary + * - TSIZE need change if !base page size, not + * yet implemented for now + * MAS 2 : Defaults not useful, need to be redone + * MAS 3+7 : Needs to be done + * + * TODO: mix up code below for better scheduling + */ + clrrdi r11,r16,12 /* Clear low crap in EA */ + rlwimi r11,r14,32-19,27,31 /* Insert WIMGE */ + mtspr SPRN_MAS2,r11 + + /* Check page size, if not standard, update MAS1 */ + rldicl r11,r14,64-8,64-8 +#ifdef CONFIG_PPC_64K_PAGES + cmpldi cr0,r11,BOOK3E_PAGESZ_64K +#else + cmpldi cr0,r11,BOOK3E_PAGESZ_4K +#endif + beq- 1f + mfspr r11,SPRN_MAS1 + rlwimi r11,r14,31,21,24 + rlwinm r11,r11,0,21,19 + mtspr SPRN_MAS1,r11 +1: + /* Move RPN in position */ + rldicr r11,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT + clrldi r15,r11,12 /* Clear crap at the top */ + rlwimi r15,r14,32-8,22,25 /* Move in U bits */ + rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */ + + /* Mask out SW and UW if !DIRTY (XXX optimize this !) */ + andi. r11,r14,_PAGE_DIRTY + bne 1f + li r11,MAS3_SW|MAS3_UW + andc r15,r15,r11 +1: +BEGIN_MMU_FTR_SECTION + srdi r16,r15,32 + mtspr SPRN_MAS3,r15 + mtspr SPRN_MAS7,r16 +MMU_FTR_SECTION_ELSE + mtspr SPRN_MAS7_MAS3,r15 +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) + + tlbwe + +normal_tlb_miss_done: + /* We don't bother with restoring DEAR or ESR since we know we are + * level 0 and just going back to userland. They are only needed + * if you are going to take an access fault + */ + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK) + TLB_MISS_EPILOG_SUCCESS + rfi + +normal_tlb_miss_access_fault: + /* We need to check if it was an instruction miss */ + andi. r10,r11,_PAGE_EXEC + bne 1f + ld r14,EX_TLB_DEAR(r12) + ld r15,EX_TLB_ESR(r12) + mtspr SPRN_DEAR,r14 + mtspr SPRN_ESR,r15 + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e +1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + + +/* + * This is the guts of the second-level TLB miss handler for direct + * misses. We are entered with: + * + * r16 = virtual page table faulting address + * r15 = region (top 4 bits of address) + * r14 = crap (free to use) + * r13 = PACA + * r12 = TLB exception frame in PACA + * r11 = crap (free to use) + * r10 = crap (free to use) + * + * Note that this should only ever be called as a second level handler + * with the current scheme when using SW load. + * That means we can always get the original fault DEAR at + * EX_TLB_DEAR-EX_TLB_SIZE(r12) + * + * It can be re-entered by the linear mapping miss handler. However, to + * avoid too much complication, it will restart the whole fault at level + * 0 so we don't care too much about clobbers + * + * XXX That code was written back when we couldn't clobber r14. We can now, + * so we could probably optimize things a bit + */ +virt_page_table_tlb_miss: + /* Are we hitting a kernel page table ? */ + andi. r10,r15,0x8 + + /* The cool thing now is that r10 contains 0 for user and 8 for kernel, + * and we happen to have the swapper_pg_dir at offset 8 from the user + * pgdir in the PACA :-). + */ + add r11,r10,r13 + + /* If kernel, we need to clear MAS1 TID */ + beq 1f + /* XXX replace the RMW cycles with immediate loads + writes */ + mfspr r10,SPRN_MAS1 + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 +1: +BEGIN_MMU_FTR_SECTION + /* Search if we already have a TLB entry for that virtual address, and + * if we do, bail out. + */ + PPC_TLBSRX_DOT(0,R16) + beq virt_page_table_tlb_miss_done +END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) + + /* Now, we need to walk the page tables. First check if we are in + * range. + */ + rldicl. r10,r16,64-(VPTE_INDEX_SIZE+3),VPTE_INDEX_SIZE+3+4 + bne- virt_page_table_tlb_miss_fault + + /* Get the PGD pointer */ + ld r15,PACAPGD(r11) + cmpldi cr0,r15,0 + beq- virt_page_table_tlb_miss_fault + + /* Get to PGD entry */ + rldicl r11,r16,64-VPTE_PGD_SHIFT,64-PGD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpdi cr0,r15,0 + bge virt_page_table_tlb_miss_fault + +#ifndef CONFIG_PPC_64K_PAGES + /* Get to PUD entry */ + rldicl r11,r16,64-VPTE_PUD_SHIFT,64-PUD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpdi cr0,r15,0 + bge virt_page_table_tlb_miss_fault +#endif /* CONFIG_PPC_64K_PAGES */ + + /* Get to PMD entry */ + rldicl r11,r16,64-VPTE_PMD_SHIFT,64-PMD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpdi cr0,r15,0 + bge virt_page_table_tlb_miss_fault + + /* Ok, we're all right, we can now create a kernel translation for + * a 4K or 64K page from r16 -> r15. + */ + /* Now we build the MAS: + * + * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG + * MAS 1 : Almost fully setup + * - PID already updated by caller if necessary + * - TSIZE for now is base page size always + * MAS 2 : Use defaults + * MAS 3+7 : Needs to be done + * + * So we only do MAS 2 and 3 for now... + */ + clrldi r11,r15,4 /* remove region ID from RPN */ + ori r10,r11,1 /* Or-in SR */ + +BEGIN_MMU_FTR_SECTION + srdi r16,r10,32 + mtspr SPRN_MAS3,r10 + mtspr SPRN_MAS7,r16 +MMU_FTR_SECTION_ELSE + mtspr SPRN_MAS7_MAS3,r10 +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) + + tlbwe + +BEGIN_MMU_FTR_SECTION +virt_page_table_tlb_miss_done: + + /* We have overriden MAS2:EPN but currently our primary TLB miss + * handler will always restore it so that should not be an issue, + * if we ever optimize the primary handler to not write MAS2 on + * some cases, we'll have to restore MAS2:EPN here based on the + * original fault's DEAR. If we do that we have to modify the + * ITLB miss handler to also store SRR0 in the exception frame + * as DEAR. + * + * However, one nasty thing we did is we cleared the reservation + * (well, potentially we did). We do a trick here thus if we + * are not a level 0 exception (we interrupted the TLB miss) we + * offset the return address by -4 in order to replay the tlbsrx + * instruction there + */ + subf r10,r13,r12 + cmpldi cr0,r10,PACA_EXTLB+EX_TLB_SIZE + bne- 1f + ld r11,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13) + addi r10,r11,-4 + std r10,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13) +1: +END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) + /* Return to caller, normal case */ + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK); + TLB_MISS_EPILOG_SUCCESS + rfi + +virt_page_table_tlb_miss_fault: + /* If we fault here, things are a little bit tricky. We need to call + * either data or instruction store fault, and we need to retrieve + * the original fault address and ESR (for data). + * + * The thing is, we know that in normal circumstances, this is + * always called as a second level tlb miss for SW load or as a first + * level TLB miss for HW load, so we should be able to peek at the + * relevant information in the first exception frame in the PACA. + * + * However, we do need to double check that, because we may just hit + * a stray kernel pointer or a userland attack trying to hit those + * areas. If that is the case, we do a data fault. (We can't get here + * from an instruction tlb miss anyway). + * + * Note also that when going to a fault, we must unwind the previous + * level as well. Since we are doing that, we don't need to clear or + * restore the TLB reservation neither. + */ + subf r10,r13,r12 + cmpldi cr0,r10,PACA_EXTLB+EX_TLB_SIZE + bne- virt_page_table_tlb_miss_whacko_fault + + /* We dig the original DEAR and ESR from slot 0 */ + ld r15,EX_TLB_DEAR+PACA_EXTLB(r13) + ld r16,EX_TLB_ESR+PACA_EXTLB(r13) + + /* We check for the "special" ESR value for instruction faults */ + cmpdi cr0,r16,-1 + beq 1f + mtspr SPRN_DEAR,r15 + mtspr SPRN_ESR,r16 + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_PT_FAULT); + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e +1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_PT_FAULT); + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + +virt_page_table_tlb_miss_whacko_fault: + /* The linear fault will restart everything so ESR and DEAR will + * not have been clobbered, let's just fault with what we have + */ + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_FAULT); + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e + + +/************************************************************** + * * + * TLB miss handling for Book3E with hw page table support * + * * + **************************************************************/ + + +/* Data TLB miss */ + START_EXCEPTION(data_tlb_miss_htw) + TLB_MISS_PROLOG + + /* Now we handle the fault proper. We only save DEAR in normal + * fault case since that's the only interesting values here. + * We could probably also optimize by not saving SRR0/1 in the + * linear mapping case but I'll leave that for later + */ + mfspr r14,SPRN_ESR + mfspr r16,SPRN_DEAR /* get faulting address */ + srdi r11,r16,60 /* get region */ + cmpldi cr0,r11,0xc /* linear mapping ? */ + TLB_MISS_STATS_SAVE_INFO + beq tlb_load_linear /* yes -> go to linear map load */ + + /* We do the user/kernel test for the PID here along with the RW test + */ + cmpldi cr0,r11,0 /* Check for user region */ + ld r15,PACAPGD(r13) /* Load user pgdir */ + beq htw_tlb_miss + + /* XXX replace the RMW cycles with immediate loads + writes */ +1: mfspr r10,SPRN_MAS1 + cmpldi cr0,r11,8 /* Check for vmalloc region */ + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 + ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */ + beq+ htw_tlb_miss + + /* We got a crappy address, just fault with whatever DEAR and ESR + * are here + */ + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e + +/* Instruction TLB miss */ + START_EXCEPTION(instruction_tlb_miss_htw) + TLB_MISS_PROLOG + + /* If we take a recursive fault, the second level handler may need + * to know whether we are handling a data or instruction fault in + * order to get to the right store fault handler. We provide that + * info by keeping a crazy value for ESR in r14 + */ + li r14,-1 /* store to exception frame is done later */ + + /* Now we handle the fault proper. We only save DEAR in the non + * linear mapping case since we know the linear mapping case will + * not re-enter. We could indeed optimize and also not save SRR0/1 + * in the linear mapping case but I'll leave that for later + * + * Faulting address is SRR0 which is already in r16 + */ + srdi r11,r16,60 /* get region */ + cmpldi cr0,r11,0xc /* linear mapping ? */ + TLB_MISS_STATS_SAVE_INFO + beq tlb_load_linear /* yes -> go to linear map load */ + + /* We do the user/kernel test for the PID here along with the RW test + */ + cmpldi cr0,r11,0 /* Check for user region */ + ld r15,PACAPGD(r13) /* Load user pgdir */ + beq htw_tlb_miss + + /* XXX replace the RMW cycles with immediate loads + writes */ +1: mfspr r10,SPRN_MAS1 + cmpldi cr0,r11,8 /* Check for vmalloc region */ + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 + ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */ + beq+ htw_tlb_miss + + /* We got a crappy address, just fault */ + TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + + +/* + * This is the guts of the second-level TLB miss handler for direct + * misses. We are entered with: + * + * r16 = virtual page table faulting address + * r15 = PGD pointer + * r14 = ESR + * r13 = PACA + * r12 = TLB exception frame in PACA + * r11 = crap (free to use) + * r10 = crap (free to use) + * + * It can be re-entered by the linear mapping miss handler. However, to + * avoid too much complication, it will save/restore things for us + */ +htw_tlb_miss: + /* Search if we already have a TLB entry for that virtual address, and + * if we do, bail out. + * + * MAS1:IND should be already set based on MAS4 + */ + PPC_TLBSRX_DOT(0,R16) + beq htw_tlb_miss_done + + /* Now, we need to walk the page tables. First check if we are in + * range. + */ + rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 + bne- htw_tlb_miss_fault + + /* Get the PGD pointer */ + cmpldi cr0,r15,0 + beq- htw_tlb_miss_fault + + /* Get to PGD entry */ + rldicl r11,r16,64-(PGDIR_SHIFT-3),64-PGD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpdi cr0,r15,0 + bge htw_tlb_miss_fault + +#ifndef CONFIG_PPC_64K_PAGES + /* Get to PUD entry */ + rldicl r11,r16,64-(PUD_SHIFT-3),64-PUD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpdi cr0,r15,0 + bge htw_tlb_miss_fault +#endif /* CONFIG_PPC_64K_PAGES */ + + /* Get to PMD entry */ + rldicl r11,r16,64-(PMD_SHIFT-3),64-PMD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpdi cr0,r15,0 + bge htw_tlb_miss_fault + + /* Ok, we're all right, we can now create an indirect entry for + * a 1M or 256M page. + * + * The last trick is now that because we use "half" pages for + * the HTW (1M IND is 2K and 256M IND is 32K) we need to account + * for an added LSB bit to the RPN. For 64K pages, there is no + * problem as we already use 32K arrays (half PTE pages), but for + * 4K page we need to extract a bit from the virtual address and + * insert it into the "PA52" bit of the RPN. + */ +#ifndef CONFIG_PPC_64K_PAGES + rlwimi r15,r16,32-9,20,20 +#endif + /* Now we build the MAS: + * + * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG + * MAS 1 : Almost fully setup + * - PID already updated by caller if necessary + * - TSIZE for now is base ind page size always + * MAS 2 : Use defaults + * MAS 3+7 : Needs to be done + */ +#ifdef CONFIG_PPC_64K_PAGES + ori r10,r15,(BOOK3E_PAGESZ_64K << MAS3_SPSIZE_SHIFT) +#else + ori r10,r15,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT) +#endif + +BEGIN_MMU_FTR_SECTION + srdi r16,r10,32 + mtspr SPRN_MAS3,r10 + mtspr SPRN_MAS7,r16 +MMU_FTR_SECTION_ELSE + mtspr SPRN_MAS7_MAS3,r10 +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) + + tlbwe + +htw_tlb_miss_done: + /* We don't bother with restoring DEAR or ESR since we know we are + * level 0 and just going back to userland. They are only needed + * if you are going to take an access fault + */ + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK) + TLB_MISS_EPILOG_SUCCESS + rfi + +htw_tlb_miss_fault: + /* We need to check if it was an instruction miss. We know this + * though because r14 would contain -1 + */ + cmpdi cr0,r14,-1 + beq 1f + mtspr SPRN_DEAR,r16 + mtspr SPRN_ESR,r14 + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_PT_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e +1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_PT_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + +/* + * This is the guts of "any" level TLB miss handler for kernel linear + * mapping misses. We are entered with: + * + * + * r16 = faulting address + * r15 = crap (free to use) + * r14 = ESR (data) or -1 (instruction) + * r13 = PACA + * r12 = TLB exception frame in PACA + * r11 = crap (free to use) + * r10 = crap (free to use) + * + * In addition we know that we will not re-enter, so in theory, we could + * use a simpler epilog not restoring SRR0/1 etc.. but we'll do that later. + * + * We also need to be careful about MAS registers here & TLB reservation, + * as we know we'll have clobbered them if we interrupt the main TLB miss + * handlers in which case we probably want to do a full restart at level + * 0 rather than saving / restoring the MAS. + * + * Note: If we care about performance of that core, we can easily shuffle + * a few things around + */ +tlb_load_linear: + /* For now, we assume the linear mapping is contiguous and stops at + * linear_map_top. We also assume the size is a multiple of 1G, thus + * we only use 1G pages for now. That might have to be changed in a + * final implementation, especially when dealing with hypervisors + */ + ld r11,PACATOC(r13) + ld r11,linear_map_top@got(r11) + ld r10,0(r11) + tovirt(10,10) + cmpld cr0,r16,r10 + bge tlb_load_linear_fault + + /* MAS1 need whole new setup. */ + li r15,(BOOK3E_PAGESZ_1GB<<MAS1_TSIZE_SHIFT) + oris r15,r15,MAS1_VALID@h /* MAS1 needs V and TSIZE */ + mtspr SPRN_MAS1,r15 + + /* Already somebody there ? */ + PPC_TLBSRX_DOT(0,R16) + beq tlb_load_linear_done + + /* Now we build the remaining MAS. MAS0 and 2 should be fine + * with their defaults, which leaves us with MAS 3 and 7. The + * mapping is linear, so we just take the address, clear the + * region bits, and or in the permission bits which are currently + * hard wired + */ + clrrdi r10,r16,30 /* 1G page index */ + clrldi r10,r10,4 /* clear region bits */ + ori r10,r10,MAS3_SR|MAS3_SW|MAS3_SX + +BEGIN_MMU_FTR_SECTION + srdi r16,r10,32 + mtspr SPRN_MAS3,r10 + mtspr SPRN_MAS7,r16 +MMU_FTR_SECTION_ELSE + mtspr SPRN_MAS7_MAS3,r10 +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) + + tlbwe + +tlb_load_linear_done: + /* We use the "error" epilog for success as we do want to + * restore to the initial faulting context, whatever it was. + * We do that because we can't resume a fault within a TLB + * miss handler, due to MAS and TLB reservation being clobbered. + */ + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_LINEAR) + TLB_MISS_EPILOG_ERROR + rfi + +tlb_load_linear_fault: + /* We keep the DEAR and ESR around, this shouldn't have happened */ + cmpdi cr0,r14,-1 + beq 1f + TLB_MISS_EPILOG_ERROR_SPECIAL + b exc_data_storage_book3e +1: TLB_MISS_EPILOG_ERROR_SPECIAL + b exc_instruction_storage_book3e + + +#ifdef CONFIG_BOOK3E_MMU_TLB_STATS +.tlb_stat_inc: +1: ldarx r8,0,r9 + addi r8,r8,1 + stdcx. r8,0,r9 + bne- 1b + blr +#endif diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c new file mode 100644 index 00000000000..92cb18d52ea --- /dev/null +++ b/arch/powerpc/mm/tlb_nohash.c @@ -0,0 +1,732 @@ +/* + * This file contains the routines for TLB flushing. + * On machines where the MMU does not use a hash table to store virtual to + * physical translations (ie, SW loaded TLBs or Book3E compilant processors, + * this does -not- include 603 however which shares the implementation with + * hash based processors) + * + * -- BenH + * + * Copyright 2008,2009 Ben Herrenschmidt <benh@kernel.crashing.org> + * IBM Corp. + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/kernel.h> +#include <linux/export.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/preempt.h> +#include <linux/spinlock.h> +#include <linux/memblock.h> +#include <linux/of_fdt.h> +#include <linux/hugetlb.h> + +#include <asm/tlbflush.h> +#include <asm/tlb.h> +#include <asm/code-patching.h> +#include <asm/hugetlb.h> +#include <asm/paca.h> + +#include "mmu_decl.h" + +/* + * This struct lists the sw-supported page sizes. The hardawre MMU may support + * other sizes not listed here. The .ind field is only used on MMUs that have + * indirect page table entries. + */ +#ifdef CONFIG_PPC_BOOK3E_MMU +#ifdef CONFIG_PPC_FSL_BOOK3E +struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { + [MMU_PAGE_4K] = { + .shift = 12, + .enc = BOOK3E_PAGESZ_4K, + }, + [MMU_PAGE_2M] = { + .shift = 21, + .enc = BOOK3E_PAGESZ_2M, + }, + [MMU_PAGE_4M] = { + .shift = 22, + .enc = BOOK3E_PAGESZ_4M, + }, + [MMU_PAGE_16M] = { + .shift = 24, + .enc = BOOK3E_PAGESZ_16M, + }, + [MMU_PAGE_64M] = { + .shift = 26, + .enc = BOOK3E_PAGESZ_64M, + }, + [MMU_PAGE_256M] = { + .shift = 28, + .enc = BOOK3E_PAGESZ_256M, + }, + [MMU_PAGE_1G] = { + .shift = 30, + .enc = BOOK3E_PAGESZ_1GB, + }, +}; +#else +struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { + [MMU_PAGE_4K] = { + .shift = 12, + .ind = 20, + .enc = BOOK3E_PAGESZ_4K, + }, + [MMU_PAGE_16K] = { + .shift = 14, + .enc = BOOK3E_PAGESZ_16K, + }, + [MMU_PAGE_64K] = { + .shift = 16, + .ind = 28, + .enc = BOOK3E_PAGESZ_64K, + }, + [MMU_PAGE_1M] = { + .shift = 20, + .enc = BOOK3E_PAGESZ_1M, + }, + [MMU_PAGE_16M] = { + .shift = 24, + .ind = 36, + .enc = BOOK3E_PAGESZ_16M, + }, + [MMU_PAGE_256M] = { + .shift = 28, + .enc = BOOK3E_PAGESZ_256M, + }, + [MMU_PAGE_1G] = { + .shift = 30, + .enc = BOOK3E_PAGESZ_1GB, + }, +}; +#endif /* CONFIG_FSL_BOOKE */ + +static inline int mmu_get_tsize(int psize) +{ + return mmu_psize_defs[psize].enc; +} +#else +static inline int mmu_get_tsize(int psize) +{ + /* This isn't used on !Book3E for now */ + return 0; +} +#endif /* CONFIG_PPC_BOOK3E_MMU */ + +/* The variables below are currently only used on 64-bit Book3E + * though this will probably be made common with other nohash + * implementations at some point + */ +#ifdef CONFIG_PPC64 + +int mmu_linear_psize; /* Page size used for the linear mapping */ +int mmu_pte_psize; /* Page size used for PTE pages */ +int mmu_vmemmap_psize; /* Page size used for the virtual mem map */ +int book3e_htw_mode; /* HW tablewalk? Value is PPC_HTW_* */ +unsigned long linear_map_top; /* Top of linear mapping */ + + +/* + * Number of bytes to add to SPRN_SPRG_TLB_EXFRAME on crit/mcheck/debug + * exceptions. This is used for bolted and e6500 TLB miss handlers which + * do not modify this SPRG in the TLB miss code; for other TLB miss handlers, + * this is set to zero. + */ +int extlb_level_exc; + +#endif /* CONFIG_PPC64 */ + +#ifdef CONFIG_PPC_FSL_BOOK3E +/* next_tlbcam_idx is used to round-robin tlbcam entry assignment */ +DEFINE_PER_CPU(int, next_tlbcam_idx); +EXPORT_PER_CPU_SYMBOL(next_tlbcam_idx); +#endif + +/* + * Base TLB flushing operations: + * + * - flush_tlb_mm(mm) flushes the specified mm context TLB's + * - flush_tlb_page(vma, vmaddr) flushes one page + * - flush_tlb_range(vma, start, end) flushes a range of pages + * - flush_tlb_kernel_range(start, end) flushes kernel pages + * + * - local_* variants of page and mm only apply to the current + * processor + */ + +/* + * These are the base non-SMP variants of page and mm flushing + */ +void local_flush_tlb_mm(struct mm_struct *mm) +{ + unsigned int pid; + + preempt_disable(); + pid = mm->context.id; + if (pid != MMU_NO_CONTEXT) + _tlbil_pid(pid); + preempt_enable(); +} +EXPORT_SYMBOL(local_flush_tlb_mm); + +void __local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, + int tsize, int ind) +{ + unsigned int pid; + + preempt_disable(); + pid = mm ? mm->context.id : 0; + if (pid != MMU_NO_CONTEXT) + _tlbil_va(vmaddr, pid, tsize, ind); + preempt_enable(); +} + +void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + __local_flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, + mmu_get_tsize(mmu_virtual_psize), 0); +} +EXPORT_SYMBOL(local_flush_tlb_page); + +/* + * And here are the SMP non-local implementations + */ +#ifdef CONFIG_SMP + +static DEFINE_RAW_SPINLOCK(tlbivax_lock); + +static int mm_is_core_local(struct mm_struct *mm) +{ + return cpumask_subset(mm_cpumask(mm), + topology_thread_cpumask(smp_processor_id())); +} + +struct tlb_flush_param { + unsigned long addr; + unsigned int pid; + unsigned int tsize; + unsigned int ind; +}; + +static void do_flush_tlb_mm_ipi(void *param) +{ + struct tlb_flush_param *p = param; + + _tlbil_pid(p ? p->pid : 0); +} + +static void do_flush_tlb_page_ipi(void *param) +{ + struct tlb_flush_param *p = param; + + _tlbil_va(p->addr, p->pid, p->tsize, p->ind); +} + + +/* Note on invalidations and PID: + * + * We snapshot the PID with preempt disabled. At this point, it can still + * change either because: + * - our context is being stolen (PID -> NO_CONTEXT) on another CPU + * - we are invaliating some target that isn't currently running here + * and is concurrently acquiring a new PID on another CPU + * - some other CPU is re-acquiring a lost PID for this mm + * etc... + * + * However, this shouldn't be a problem as we only guarantee + * invalidation of TLB entries present prior to this call, so we + * don't care about the PID changing, and invalidating a stale PID + * is generally harmless. + */ + +void flush_tlb_mm(struct mm_struct *mm) +{ + unsigned int pid; + + preempt_disable(); + pid = mm->context.id; + if (unlikely(pid == MMU_NO_CONTEXT)) + goto no_context; + if (!mm_is_core_local(mm)) { + struct tlb_flush_param p = { .pid = pid }; + /* Ignores smp_processor_id() even if set. */ + smp_call_function_many(mm_cpumask(mm), + do_flush_tlb_mm_ipi, &p, 1); + } + _tlbil_pid(pid); + no_context: + preempt_enable(); +} +EXPORT_SYMBOL(flush_tlb_mm); + +void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, + int tsize, int ind) +{ + struct cpumask *cpu_mask; + unsigned int pid; + + preempt_disable(); + pid = mm ? mm->context.id : 0; + if (unlikely(pid == MMU_NO_CONTEXT)) + goto bail; + cpu_mask = mm_cpumask(mm); + if (!mm_is_core_local(mm)) { + /* If broadcast tlbivax is supported, use it */ + if (mmu_has_feature(MMU_FTR_USE_TLBIVAX_BCAST)) { + int lock = mmu_has_feature(MMU_FTR_LOCK_BCAST_INVAL); + if (lock) + raw_spin_lock(&tlbivax_lock); + _tlbivax_bcast(vmaddr, pid, tsize, ind); + if (lock) + raw_spin_unlock(&tlbivax_lock); + goto bail; + } else { + struct tlb_flush_param p = { + .pid = pid, + .addr = vmaddr, + .tsize = tsize, + .ind = ind, + }; + /* Ignores smp_processor_id() even if set in cpu_mask */ + smp_call_function_many(cpu_mask, + do_flush_tlb_page_ipi, &p, 1); + } + } + _tlbil_va(vmaddr, pid, tsize, ind); + bail: + preempt_enable(); +} + +void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ +#ifdef CONFIG_HUGETLB_PAGE + if (vma && is_vm_hugetlb_page(vma)) + flush_hugetlb_page(vma, vmaddr); +#endif + + __flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, + mmu_get_tsize(mmu_virtual_psize), 0); +} +EXPORT_SYMBOL(flush_tlb_page); + +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_PPC_47x +void __init early_init_mmu_47x(void) +{ +#ifdef CONFIG_SMP + unsigned long root = of_get_flat_dt_root(); + if (of_get_flat_dt_prop(root, "cooperative-partition", NULL)) + mmu_clear_feature(MMU_FTR_USE_TLBIVAX_BCAST); +#endif /* CONFIG_SMP */ +} +#endif /* CONFIG_PPC_47x */ + +/* + * Flush kernel TLB entries in the given range + */ +void flush_tlb_kernel_range(unsigned long start, unsigned long end) +{ +#ifdef CONFIG_SMP + preempt_disable(); + smp_call_function(do_flush_tlb_mm_ipi, NULL, 1); + _tlbil_pid(0); + preempt_enable(); +#else + _tlbil_pid(0); +#endif +} +EXPORT_SYMBOL(flush_tlb_kernel_range); + +/* + * Currently, for range flushing, we just do a full mm flush. This should + * be optimized based on a threshold on the size of the range, since + * some implementation can stack multiple tlbivax before a tlbsync but + * for now, we keep it that way + */ +void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) + +{ + flush_tlb_mm(vma->vm_mm); +} +EXPORT_SYMBOL(flush_tlb_range); + +void tlb_flush(struct mmu_gather *tlb) +{ + flush_tlb_mm(tlb->mm); +} + +/* + * Below are functions specific to the 64-bit variant of Book3E though that + * may change in the future + */ + +#ifdef CONFIG_PPC64 + +/* + * Handling of virtual linear page tables or indirect TLB entries + * flushing when PTE pages are freed + */ +void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address) +{ + int tsize = mmu_psize_defs[mmu_pte_psize].enc; + + if (book3e_htw_mode != PPC_HTW_NONE) { + unsigned long start = address & PMD_MASK; + unsigned long end = address + PMD_SIZE; + unsigned long size = 1UL << mmu_psize_defs[mmu_pte_psize].shift; + + /* This isn't the most optimal, ideally we would factor out the + * while preempt & CPU mask mucking around, or even the IPI but + * it will do for now + */ + while (start < end) { + __flush_tlb_page(tlb->mm, start, tsize, 1); + start += size; + } + } else { + unsigned long rmask = 0xf000000000000000ul; + unsigned long rid = (address & rmask) | 0x1000000000000000ul; + unsigned long vpte = address & ~rmask; + +#ifdef CONFIG_PPC_64K_PAGES + vpte = (vpte >> (PAGE_SHIFT - 4)) & ~0xfffful; +#else + vpte = (vpte >> (PAGE_SHIFT - 3)) & ~0xffful; +#endif + vpte |= rid; + __flush_tlb_page(tlb->mm, vpte, tsize, 0); + } +} + +static void setup_page_sizes(void) +{ + unsigned int tlb0cfg; + unsigned int tlb0ps; + unsigned int eptcfg; + int i, psize; + +#ifdef CONFIG_PPC_FSL_BOOK3E + unsigned int mmucfg = mfspr(SPRN_MMUCFG); + int fsl_mmu = mmu_has_feature(MMU_FTR_TYPE_FSL_E); + + if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V1) { + unsigned int tlb1cfg = mfspr(SPRN_TLB1CFG); + unsigned int min_pg, max_pg; + + min_pg = (tlb1cfg & TLBnCFG_MINSIZE) >> TLBnCFG_MINSIZE_SHIFT; + max_pg = (tlb1cfg & TLBnCFG_MAXSIZE) >> TLBnCFG_MAXSIZE_SHIFT; + + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { + struct mmu_psize_def *def; + unsigned int shift; + + def = &mmu_psize_defs[psize]; + shift = def->shift; + + if (shift == 0 || shift & 1) + continue; + + /* adjust to be in terms of 4^shift Kb */ + shift = (shift - 10) >> 1; + + if ((shift >= min_pg) && (shift <= max_pg)) + def->flags |= MMU_PAGE_SIZE_DIRECT; + } + + goto out; + } + + if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2) { + u32 tlb1cfg, tlb1ps; + + tlb0cfg = mfspr(SPRN_TLB0CFG); + tlb1cfg = mfspr(SPRN_TLB1CFG); + tlb1ps = mfspr(SPRN_TLB1PS); + eptcfg = mfspr(SPRN_EPTCFG); + + if ((tlb1cfg & TLBnCFG_IND) && (tlb0cfg & TLBnCFG_PT)) + book3e_htw_mode = PPC_HTW_E6500; + + /* + * We expect 4K subpage size and unrestricted indirect size. + * The lack of a restriction on indirect size is a Freescale + * extension, indicated by PSn = 0 but SPSn != 0. + */ + if (eptcfg != 2) + book3e_htw_mode = PPC_HTW_NONE; + + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { + struct mmu_psize_def *def = &mmu_psize_defs[psize]; + + if (tlb1ps & (1U << (def->shift - 10))) { + def->flags |= MMU_PAGE_SIZE_DIRECT; + + if (book3e_htw_mode && psize == MMU_PAGE_2M) + def->flags |= MMU_PAGE_SIZE_INDIRECT; + } + } + + goto out; + } +#endif + + tlb0cfg = mfspr(SPRN_TLB0CFG); + tlb0ps = mfspr(SPRN_TLB0PS); + eptcfg = mfspr(SPRN_EPTCFG); + + /* Look for supported direct sizes */ + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { + struct mmu_psize_def *def = &mmu_psize_defs[psize]; + + if (tlb0ps & (1U << (def->shift - 10))) + def->flags |= MMU_PAGE_SIZE_DIRECT; + } + + /* Indirect page sizes supported ? */ + if ((tlb0cfg & TLBnCFG_IND) == 0 || + (tlb0cfg & TLBnCFG_PT) == 0) + goto out; + + book3e_htw_mode = PPC_HTW_IBM; + + /* Now, we only deal with one IND page size for each + * direct size. Hopefully all implementations today are + * unambiguous, but we might want to be careful in the + * future. + */ + for (i = 0; i < 3; i++) { + unsigned int ps, sps; + + sps = eptcfg & 0x1f; + eptcfg >>= 5; + ps = eptcfg & 0x1f; + eptcfg >>= 5; + if (!ps || !sps) + continue; + for (psize = 0; psize < MMU_PAGE_COUNT; psize++) { + struct mmu_psize_def *def = &mmu_psize_defs[psize]; + + if (ps == (def->shift - 10)) + def->flags |= MMU_PAGE_SIZE_INDIRECT; + if (sps == (def->shift - 10)) + def->ind = ps + 10; + } + } + +out: + /* Cleanup array and print summary */ + pr_info("MMU: Supported page sizes\n"); + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { + struct mmu_psize_def *def = &mmu_psize_defs[psize]; + const char *__page_type_names[] = { + "unsupported", + "direct", + "indirect", + "direct & indirect" + }; + if (def->flags == 0) { + def->shift = 0; + continue; + } + pr_info(" %8ld KB as %s\n", 1ul << (def->shift - 10), + __page_type_names[def->flags & 0x3]); + } +} + +static void setup_mmu_htw(void) +{ + /* + * If we want to use HW tablewalk, enable it by patching the TLB miss + * handlers to branch to the one dedicated to it. + */ + + switch (book3e_htw_mode) { + case PPC_HTW_IBM: + patch_exception(0x1c0, exc_data_tlb_miss_htw_book3e); + patch_exception(0x1e0, exc_instruction_tlb_miss_htw_book3e); + break; +#ifdef CONFIG_PPC_FSL_BOOK3E + case PPC_HTW_E6500: + extlb_level_exc = EX_TLB_SIZE; + patch_exception(0x1c0, exc_data_tlb_miss_e6500_book3e); + patch_exception(0x1e0, exc_instruction_tlb_miss_e6500_book3e); + break; +#endif + } + pr_info("MMU: Book3E HW tablewalk %s\n", + book3e_htw_mode != PPC_HTW_NONE ? "enabled" : "not supported"); +} + +/* + * Early initialization of the MMU TLB code + */ +static void __early_init_mmu(int boot_cpu) +{ + unsigned int mas4; + + /* XXX This will have to be decided at runtime, but right + * now our boot and TLB miss code hard wires it. Ideally + * we should find out a suitable page size and patch the + * TLB miss code (either that or use the PACA to store + * the value we want) + */ + mmu_linear_psize = MMU_PAGE_1G; + + /* XXX This should be decided at runtime based on supported + * page sizes in the TLB, but for now let's assume 16M is + * always there and a good fit (which it probably is) + * + * Freescale booke only supports 4K pages in TLB0, so use that. + */ + if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) + mmu_vmemmap_psize = MMU_PAGE_4K; + else + mmu_vmemmap_psize = MMU_PAGE_16M; + + /* XXX This code only checks for TLB 0 capabilities and doesn't + * check what page size combos are supported by the HW. It + * also doesn't handle the case where a separate array holds + * the IND entries from the array loaded by the PT. + */ + if (boot_cpu) { + /* Look for supported page sizes */ + setup_page_sizes(); + + /* Look for HW tablewalk support */ + setup_mmu_htw(); + } + + /* Set MAS4 based on page table setting */ + + mas4 = 0x4 << MAS4_WIMGED_SHIFT; + switch (book3e_htw_mode) { + case PPC_HTW_E6500: + mas4 |= MAS4_INDD; + mas4 |= BOOK3E_PAGESZ_2M << MAS4_TSIZED_SHIFT; + mas4 |= MAS4_TLBSELD(1); + mmu_pte_psize = MMU_PAGE_2M; + break; + + case PPC_HTW_IBM: + mas4 |= MAS4_INDD; +#ifdef CONFIG_PPC_64K_PAGES + mas4 |= BOOK3E_PAGESZ_256M << MAS4_TSIZED_SHIFT; + mmu_pte_psize = MMU_PAGE_256M; +#else + mas4 |= BOOK3E_PAGESZ_1M << MAS4_TSIZED_SHIFT; + mmu_pte_psize = MMU_PAGE_1M; +#endif + break; + + case PPC_HTW_NONE: +#ifdef CONFIG_PPC_64K_PAGES + mas4 |= BOOK3E_PAGESZ_64K << MAS4_TSIZED_SHIFT; +#else + mas4 |= BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT; +#endif + mmu_pte_psize = mmu_virtual_psize; + break; + } + mtspr(SPRN_MAS4, mas4); + + /* Set the global containing the top of the linear mapping + * for use by the TLB miss code + */ + linear_map_top = memblock_end_of_DRAM(); + +#ifdef CONFIG_PPC_FSL_BOOK3E + if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { + unsigned int num_cams; + + /* use a quarter of the TLBCAM for bolted linear map */ + num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4; + linear_map_top = map_mem_in_cams(linear_map_top, num_cams); + + /* limit memory so we dont have linear faults */ + memblock_enforce_memory_limit(linear_map_top); + + if (book3e_htw_mode == PPC_HTW_NONE) { + extlb_level_exc = EX_TLB_SIZE; + patch_exception(0x1c0, exc_data_tlb_miss_bolted_book3e); + patch_exception(0x1e0, + exc_instruction_tlb_miss_bolted_book3e); + } + } +#endif + + /* A sync won't hurt us after mucking around with + * the MMU configuration + */ + mb(); + + memblock_set_current_limit(linear_map_top); +} + +void __init early_init_mmu(void) +{ + __early_init_mmu(1); +} + +void early_init_mmu_secondary(void) +{ + __early_init_mmu(0); +} + +void setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + /* On non-FSL Embedded 64-bit, we adjust the RMA size to match + * the bolted TLB entry. We know for now that only 1G + * entries are supported though that may eventually + * change. + * + * on FSL Embedded 64-bit, we adjust the RMA size to match the + * first bolted TLB entry size. We still limit max to 1G even if + * the TLB could cover more. This is due to what the early init + * code is setup to do. + * + * We crop it to the size of the first MEMBLOCK to + * avoid going over total available memory just in case... + */ +#ifdef CONFIG_PPC_FSL_BOOK3E + if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { + unsigned long linear_sz; + linear_sz = calc_cam_sz(first_memblock_size, PAGE_OFFSET, + first_memblock_base); + ppc64_rma_size = min_t(u64, linear_sz, 0x40000000); + } else +#endif + ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000); + + /* Finally limit subsequent allocations */ + memblock_set_current_limit(first_memblock_base + ppc64_rma_size); +} +#else /* ! CONFIG_PPC64 */ +void __init early_init_mmu(void) +{ +#ifdef CONFIG_PPC_47x + early_init_mmu_47x(); +#endif +} +#endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S new file mode 100644 index 00000000000..43ff3c797fb --- /dev/null +++ b/arch/powerpc/mm/tlb_nohash_low.S @@ -0,0 +1,426 @@ +/* + * This file contains low-level functions for performing various + * types of TLB invalidations on various processors with no hash + * table. + * + * This file implements the following functions for all no-hash + * processors. Some aren't implemented for some variants. Some + * are inline in tlbflush.h + * + * - tlbil_va + * - tlbil_pid + * - tlbil_all + * - tlbivax_bcast + * + * Code mostly moved over from misc_32.S + * + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Partially rewritten by Cort Dougan (cort@cs.nmt.edu) + * Paul Mackerras, Kumar Gala and Benjamin Herrenschmidt. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <asm/reg.h> +#include <asm/page.h> +#include <asm/cputable.h> +#include <asm/mmu.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/processor.h> +#include <asm/bug.h> + +#if defined(CONFIG_40x) + +/* + * 40x implementation needs only tlbil_va + */ +_GLOBAL(__tlbil_va) + /* We run the search with interrupts disabled because we have to change + * the PID and I don't want to preempt when that happens. + */ + mfmsr r5 + mfspr r6,SPRN_PID + wrteei 0 + mtspr SPRN_PID,r4 + tlbsx. r3, 0, r3 + mtspr SPRN_PID,r6 + wrtee r5 + bne 1f + sync + /* There are only 64 TLB entries, so r3 < 64, which means bit 25 is + * clear. Since 25 is the V bit in the TLB_TAG, loading this value + * will invalidate the TLB entry. */ + tlbwe r3, r3, TLB_TAG + isync +1: blr + +#elif defined(CONFIG_8xx) + +/* + * Nothing to do for 8xx, everything is inline + */ + +#elif defined(CONFIG_44x) /* Includes 47x */ + +/* + * 440 implementation uses tlbsx/we for tlbil_va and a full sweep + * of the TLB for everything else. + */ +_GLOBAL(__tlbil_va) + mfspr r5,SPRN_MMUCR + mfmsr r10 + + /* + * We write 16 bits of STID since 47x supports that much, we + * will never be passed out of bounds values on 440 (hopefully) + */ + rlwimi r5,r4,0,16,31 + + /* We have to run the search with interrupts disabled, otherwise + * an interrupt which causes a TLB miss can clobber the MMUCR + * between the mtspr and the tlbsx. + * + * Critical and Machine Check interrupts take care of saving + * and restoring MMUCR, so only normal interrupts have to be + * taken care of. + */ + wrteei 0 + mtspr SPRN_MMUCR,r5 + tlbsx. r6,0,r3 + bne 10f + sync +BEGIN_MMU_FTR_SECTION + b 2f +END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x) + /* On 440 There are only 64 TLB entries, so r3 < 64, which means bit + * 22, is clear. Since 22 is the V bit in the TLB_PAGEID, loading this + * value will invalidate the TLB entry. + */ + tlbwe r6,r6,PPC44x_TLB_PAGEID + isync +10: wrtee r10 + blr +2: +#ifdef CONFIG_PPC_47x + oris r7,r6,0x8000 /* specify way explicitely */ + clrrwi r4,r3,12 /* get an EPN for the hashing with V = 0 */ + ori r4,r4,PPC47x_TLBE_SIZE + tlbwe r4,r7,0 /* write it */ + isync + wrtee r10 + blr +#else /* CONFIG_PPC_47x */ +1: trap + EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0; +#endif /* !CONFIG_PPC_47x */ + +_GLOBAL(_tlbil_all) +_GLOBAL(_tlbil_pid) +BEGIN_MMU_FTR_SECTION + b 2f +END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x) + li r3,0 + sync + + /* Load high watermark */ + lis r4,tlb_44x_hwater@ha + lwz r5,tlb_44x_hwater@l(r4) + +1: tlbwe r3,r3,PPC44x_TLB_PAGEID + addi r3,r3,1 + cmpw 0,r3,r5 + ble 1b + + isync + blr +2: +#ifdef CONFIG_PPC_47x + /* 476 variant. There's not simple way to do this, hopefully we'll + * try to limit the amount of such full invalidates + */ + mfmsr r11 /* Interrupts off */ + wrteei 0 + li r3,-1 /* Current set */ + lis r10,tlb_47x_boltmap@h + ori r10,r10,tlb_47x_boltmap@l + lis r7,0x8000 /* Specify way explicitely */ + + b 9f /* For each set */ + +1: li r9,4 /* Number of ways */ + li r4,0 /* Current way */ + li r6,0 /* Default entry value 0 */ + andi. r0,r8,1 /* Check if way 0 is bolted */ + mtctr r9 /* Load way counter */ + bne- 3f /* Bolted, skip loading it */ + +2: /* For each way */ + or r5,r3,r4 /* Make way|index for tlbre */ + rlwimi r5,r5,16,8,15 /* Copy index into position */ + tlbre r6,r5,0 /* Read entry */ +3: addis r4,r4,0x2000 /* Next way */ + andi. r0,r6,PPC47x_TLB0_VALID /* Valid entry ? */ + beq 4f /* Nope, skip it */ + rlwimi r7,r5,0,1,2 /* Insert way number */ + rlwinm r6,r6,0,21,19 /* Clear V */ + tlbwe r6,r7,0 /* Write it */ +4: bdnz 2b /* Loop for each way */ + srwi r8,r8,1 /* Next boltmap bit */ +9: cmpwi cr1,r3,255 /* Last set done ? */ + addi r3,r3,1 /* Next set */ + beq cr1,1f /* End of loop */ + andi. r0,r3,0x1f /* Need to load a new boltmap word ? */ + bne 1b /* No, loop */ + lwz r8,0(r10) /* Load boltmap entry */ + addi r10,r10,4 /* Next word */ + b 1b /* Then loop */ +1: isync /* Sync shadows */ + wrtee r11 +#else /* CONFIG_PPC_47x */ +1: trap + EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0; +#endif /* !CONFIG_PPC_47x */ + blr + +#ifdef CONFIG_PPC_47x + +/* + * _tlbivax_bcast is only on 47x. We don't bother doing a runtime + * check though, it will blow up soon enough if we mistakenly try + * to use it on a 440. + */ +_GLOBAL(_tlbivax_bcast) + mfspr r5,SPRN_MMUCR + mfmsr r10 + rlwimi r5,r4,0,16,31 + wrteei 0 + mtspr SPRN_MMUCR,r5 + isync + PPC_TLBIVAX(0, R3) + isync + eieio + tlbsync +BEGIN_FTR_SECTION + b 1f +END_FTR_SECTION_IFSET(CPU_FTR_476_DD2) + sync + wrtee r10 + blr +/* + * DD2 HW could hang if in instruction fetch happens before msync completes. + * Touch enough instruction cache lines to ensure cache hits + */ +1: mflr r9 + bl 2f +2: mflr r6 + li r7,32 + PPC_ICBT(0,R6,R7) /* touch next cache line */ + add r6,r6,r7 + PPC_ICBT(0,R6,R7) /* touch next cache line */ + add r6,r6,r7 + PPC_ICBT(0,R6,R7) /* touch next cache line */ + sync + nop + nop + nop + nop + nop + nop + nop + nop + mtlr r9 + wrtee r10 + blr +#endif /* CONFIG_PPC_47x */ + +#elif defined(CONFIG_FSL_BOOKE) +/* + * FSL BookE implementations. + * + * Since feature sections are using _SECTION_ELSE we need + * to have the larger code path before the _SECTION_ELSE + */ + +/* + * Flush MMU TLB on the local processor + */ +_GLOBAL(_tlbil_all) +BEGIN_MMU_FTR_SECTION + li r3,(MMUCSR0_TLBFI)@l + mtspr SPRN_MMUCSR0, r3 +1: + mfspr r3,SPRN_MMUCSR0 + andi. r3,r3,MMUCSR0_TLBFI@l + bne 1b +MMU_FTR_SECTION_ELSE + PPC_TLBILX_ALL(0,R0) +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX) + msync + isync + blr + +_GLOBAL(_tlbil_pid) +BEGIN_MMU_FTR_SECTION + slwi r3,r3,16 + mfmsr r10 + wrteei 0 + mfspr r4,SPRN_MAS6 /* save MAS6 */ + mtspr SPRN_MAS6,r3 + PPC_TLBILX_PID(0,R0) + mtspr SPRN_MAS6,r4 /* restore MAS6 */ + wrtee r10 +MMU_FTR_SECTION_ELSE + li r3,(MMUCSR0_TLBFI)@l + mtspr SPRN_MMUCSR0, r3 +1: + mfspr r3,SPRN_MMUCSR0 + andi. r3,r3,MMUCSR0_TLBFI@l + bne 1b +ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBILX) + msync + isync + blr + +/* + * Flush MMU TLB for a particular address, but only on the local processor + * (no broadcast) + */ +_GLOBAL(__tlbil_va) + mfmsr r10 + wrteei 0 + slwi r4,r4,16 + ori r4,r4,(MAS6_ISIZE(BOOK3E_PAGESZ_4K))@l + mtspr SPRN_MAS6,r4 /* assume AS=0 for now */ +BEGIN_MMU_FTR_SECTION + tlbsx 0,r3 + mfspr r4,SPRN_MAS1 /* check valid */ + andis. r3,r4,MAS1_VALID@h + beq 1f + rlwinm r4,r4,0,1,31 + mtspr SPRN_MAS1,r4 + tlbwe +MMU_FTR_SECTION_ELSE + PPC_TLBILX_VA(0,R3) +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX) + msync + isync +1: wrtee r10 + blr +#elif defined(CONFIG_PPC_BOOK3E) +/* + * New Book3E (>= 2.06) implementation + * + * Note: We may be able to get away without the interrupt masking stuff + * if we save/restore MAS6 on exceptions that might modify it + */ +_GLOBAL(_tlbil_pid) + slwi r4,r3,MAS6_SPID_SHIFT + mfmsr r10 + wrteei 0 + mtspr SPRN_MAS6,r4 + PPC_TLBILX_PID(0,R0) + wrtee r10 + msync + isync + blr + +_GLOBAL(_tlbil_pid_noind) + slwi r4,r3,MAS6_SPID_SHIFT + mfmsr r10 + ori r4,r4,MAS6_SIND + wrteei 0 + mtspr SPRN_MAS6,r4 + PPC_TLBILX_PID(0,R0) + wrtee r10 + msync + isync + blr + +_GLOBAL(_tlbil_all) + PPC_TLBILX_ALL(0,R0) + msync + isync + blr + +_GLOBAL(_tlbil_va) + mfmsr r10 + wrteei 0 + cmpwi cr0,r6,0 + slwi r4,r4,MAS6_SPID_SHIFT + rlwimi r4,r5,MAS6_ISIZE_SHIFT,MAS6_ISIZE_MASK + beq 1f + rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND +1: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */ + PPC_TLBILX_VA(0,R3) + msync + isync + wrtee r10 + blr + +_GLOBAL(_tlbivax_bcast) + mfmsr r10 + wrteei 0 + cmpwi cr0,r6,0 + slwi r4,r4,MAS6_SPID_SHIFT + rlwimi r4,r5,MAS6_ISIZE_SHIFT,MAS6_ISIZE_MASK + beq 1f + rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND +1: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */ + PPC_TLBIVAX(0,R3) + eieio + tlbsync + sync + wrtee r10 + blr + +_GLOBAL(set_context) +#ifdef CONFIG_BDI_SWITCH + /* Context switch the PTE pointer for the Abatron BDI2000. + * The PGDIR is the second parameter. + */ + lis r5, abatron_pteptrs@h + ori r5, r5, abatron_pteptrs@l + stw r4, 0x4(r5) +#endif + mtspr SPRN_PID,r3 + isync /* Force context change */ + blr +#else +#error Unsupported processor type ! +#endif + +#if defined(CONFIG_PPC_FSL_BOOK3E) +/* + * extern void loadcam_entry(unsigned int index) + * + * Load TLBCAM[index] entry in to the L2 CAM MMU + */ +_GLOBAL(loadcam_entry) + mflr r5 + LOAD_REG_ADDR_PIC(r4, TLBCAM) + mtlr r5 + mulli r5,r3,TLBCAM_SIZE + add r3,r5,r4 + lwz r4,TLBCAM_MAS0(r3) + mtspr SPRN_MAS0,r4 + lwz r4,TLBCAM_MAS1(r3) + mtspr SPRN_MAS1,r4 + PPC_LL r4,TLBCAM_MAS2(r3) + mtspr SPRN_MAS2,r4 + lwz r4,TLBCAM_MAS3(r3) + mtspr SPRN_MAS3,r4 +BEGIN_MMU_FTR_SECTION + lwz r4,TLBCAM_MAS7(r3) + mtspr SPRN_MAS7,r4 +END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS) + isync + tlbwe + isync + blr +#endif |
