aboutsummaryrefslogtreecommitdiff
path: root/arch/powerpc/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r--arch/powerpc/mm/40x_mmu.c41
-rw-r--r--arch/powerpc/mm/44x_mmu.c193
-rw-r--r--arch/powerpc/mm/Makefile37
-rw-r--r--arch/powerpc/mm/dma-noncoherent.c419
-rw-r--r--arch/powerpc/mm/fault.c323
-rw-r--r--arch/powerpc/mm/fsl_booke_mmu.c265
-rw-r--r--arch/powerpc/mm/gup.c235
-rw-r--r--arch/powerpc/mm/hash_low_32.S245
-rw-r--r--arch/powerpc/mm/hash_low_64.S431
-rw-r--r--arch/powerpc/mm/hash_native_64.c479
-rw-r--r--arch/powerpc/mm/hash_utils_64.c924
-rw-r--r--arch/powerpc/mm/highmem.c86
-rw-r--r--arch/powerpc/mm/hugepage-hash64.c179
-rw-r--r--arch/powerpc/mm/hugetlbpage-book3e.c153
-rw-r--r--arch/powerpc/mm/hugetlbpage-hash64.c129
-rw-r--r--arch/powerpc/mm/hugetlbpage.c1246
-rw-r--r--arch/powerpc/mm/icswx.c292
-rw-r--r--arch/powerpc/mm/icswx.h68
-rw-r--r--arch/powerpc/mm/icswx_pid.c87
-rw-r--r--arch/powerpc/mm/init_32.c205
-rw-r--r--arch/powerpc/mm/init_64.c328
-rw-r--r--arch/powerpc/mm/lmb.c357
-rw-r--r--arch/powerpc/mm/mem.c487
-rw-r--r--arch/powerpc/mm/mmap.c58
-rw-r--r--arch/powerpc/mm/mmu_context_32.c84
-rw-r--r--arch/powerpc/mm/mmu_context_64.c70
-rw-r--r--arch/powerpc/mm/mmu_context_hash32.c119
-rw-r--r--arch/powerpc/mm/mmu_context_hash64.c146
-rw-r--r--arch/powerpc/mm/mmu_context_nohash.c449
-rw-r--r--arch/powerpc/mm/mmu_decl.h137
-rw-r--r--arch/powerpc/mm/numa.c1501
-rw-r--r--arch/powerpc/mm/pgtable.c236
-rw-r--r--arch/powerpc/mm/pgtable_32.c190
-rw-r--r--arch/powerpc/mm/pgtable_64.c692
-rw-r--r--arch/powerpc/mm/ppc_mmu_32.c84
-rw-r--r--arch/powerpc/mm/slb.c162
-rw-r--r--arch/powerpc/mm/slb_low.S150
-rw-r--r--arch/powerpc/mm/slice.c511
-rw-r--r--arch/powerpc/mm/stab.c39
-rw-r--r--arch/powerpc/mm/subpage-prot.c76
-rw-r--r--arch/powerpc/mm/tlb_hash32.c (renamed from arch/powerpc/mm/tlb_32.c)25
-rw-r--r--arch/powerpc/mm/tlb_hash64.c (renamed from arch/powerpc/mm/tlb_64.c)186
-rw-r--r--arch/powerpc/mm/tlb_low_64e.S1171
-rw-r--r--arch/powerpc/mm/tlb_nohash.c732
-rw-r--r--arch/powerpc/mm/tlb_nohash_low.S426
45 files changed, 11231 insertions, 3222 deletions
diff --git a/arch/powerpc/mm/40x_mmu.c b/arch/powerpc/mm/40x_mmu.c
index 3899ea97fbd..5810967511d 100644
--- a/arch/powerpc/mm/40x_mmu.c
+++ b/arch/powerpc/mm/40x_mmu.c
@@ -35,6 +35,7 @@
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/highmem.h>
+#include <linux/memblock.h>
#include <asm/pgalloc.h>
#include <asm/prom.h>
@@ -47,6 +48,7 @@
#include <asm/bootx.h>
#include <asm/machdep.h>
#include <asm/setup.h>
+
#include "mmu_decl.h"
extern int __map_without_ltlbs;
@@ -84,20 +86,20 @@ void __init MMU_init_hw(void)
* vectors and the kernel live in real-mode.
*/
- mtspr(SPRN_DCCR, 0xF0000000); /* 512 MB of data space at 0x0. */
- mtspr(SPRN_ICCR, 0xF0000000); /* 512 MB of instr. space at 0x0. */
+ mtspr(SPRN_DCCR, 0xFFFF0000); /* 2GByte of data space at 0x0. */
+ mtspr(SPRN_ICCR, 0xFFFF0000); /* 2GByte of instr. space at 0x0. */
}
#define LARGE_PAGE_SIZE_16M (1<<24)
#define LARGE_PAGE_SIZE_4M (1<<22)
-unsigned long __init mmu_mapin_ram(void)
+unsigned long __init mmu_mapin_ram(unsigned long top)
{
- unsigned long v, s;
+ unsigned long v, s, mapped;
phys_addr_t p;
v = KERNELBASE;
- p = PPC_MEMSTART;
+ p = 0;
s = total_lowmem;
if (__map_without_ltlbs)
@@ -105,7 +107,7 @@ unsigned long __init mmu_mapin_ram(void)
while (s >= LARGE_PAGE_SIZE_16M) {
pmd_t *pmdp;
- unsigned long val = p | _PMD_SIZE_16M | _PAGE_HWEXEC | _PAGE_HWWRITE;
+ unsigned long val = p | _PMD_SIZE_16M | _PAGE_EXEC | _PAGE_HWWRITE;
pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v);
pmd_val(*pmdp++) = val;
@@ -120,7 +122,7 @@ unsigned long __init mmu_mapin_ram(void)
while (s >= LARGE_PAGE_SIZE_4M) {
pmd_t *pmdp;
- unsigned long val = p | _PMD_SIZE_4M | _PAGE_HWEXEC | _PAGE_HWWRITE;
+ unsigned long val = p | _PMD_SIZE_4M | _PAGE_EXEC | _PAGE_HWWRITE;
pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v);
pmd_val(*pmdp) = val;
@@ -130,5 +132,28 @@ unsigned long __init mmu_mapin_ram(void)
s -= LARGE_PAGE_SIZE_4M;
}
- return total_lowmem - s;
+ mapped = total_lowmem - s;
+
+ /* If the size of RAM is not an exact power of two, we may not
+ * have covered RAM in its entirety with 16 and 4 MiB
+ * pages. Consequently, restrict the top end of RAM currently
+ * allocable so that calls to the MEMBLOCK to allocate PTEs for "tail"
+ * coverage with normal-sized pages (or other reasons) do not
+ * attempt to allocate outside the allowed range.
+ */
+ memblock_set_current_limit(mapped);
+
+ return mapped;
+}
+
+void setup_initial_memory_limit(phys_addr_t first_memblock_base,
+ phys_addr_t first_memblock_size)
+{
+ /* We don't currently support the first MEMBLOCK not mapping 0
+ * physical on those processors
+ */
+ BUG_ON(first_memblock_base != 0);
+
+ /* 40x can only access 16MB at the moment (see head_40x.S) */
+ memblock_set_current_limit(min_t(u64, first_memblock_size, 0x00800000));
}
diff --git a/arch/powerpc/mm/44x_mmu.c b/arch/powerpc/mm/44x_mmu.c
index 04dc08798d3..82b1ff759e2 100644
--- a/arch/powerpc/mm/44x_mmu.c
+++ b/arch/powerpc/mm/44x_mmu.c
@@ -24,9 +24,11 @@
*/
#include <linux/init.h>
+#include <linux/memblock.h>
+
#include <asm/mmu.h>
-#include <asm/system.h>
#include <asm/page.h>
+#include <asm/cacheflush.h>
#include "mmu_decl.h"
@@ -37,11 +39,39 @@ unsigned int tlb_44x_index; /* = 0 */
unsigned int tlb_44x_hwater = PPC44x_TLB_SIZE - 1 - PPC44x_EARLY_TLBS;
int icache_44x_need_flush;
+unsigned long tlb_47x_boltmap[1024/8];
+
+static void ppc44x_update_tlb_hwater(void)
+{
+ extern unsigned int tlb_44x_patch_hwater_D[];
+ extern unsigned int tlb_44x_patch_hwater_I[];
+
+ /* The TLB miss handlers hard codes the watermark in a cmpli
+ * instruction to improve performances rather than loading it
+ * from the global variable. Thus, we patch the instructions
+ * in the 2 TLB miss handlers when updating the value
+ */
+ tlb_44x_patch_hwater_D[0] = (tlb_44x_patch_hwater_D[0] & 0xffff0000) |
+ tlb_44x_hwater;
+ flush_icache_range((unsigned long)&tlb_44x_patch_hwater_D[0],
+ (unsigned long)&tlb_44x_patch_hwater_D[1]);
+ tlb_44x_patch_hwater_I[0] = (tlb_44x_patch_hwater_I[0] & 0xffff0000) |
+ tlb_44x_hwater;
+ flush_icache_range((unsigned long)&tlb_44x_patch_hwater_I[0],
+ (unsigned long)&tlb_44x_patch_hwater_I[1]);
+}
+
/*
- * "Pins" a 256MB TLB entry in AS0 for kernel lowmem
+ * "Pins" a 256MB TLB entry in AS0 for kernel lowmem for 44x type MMU
*/
static void __init ppc44x_pin_tlb(unsigned int virt, unsigned int phys)
{
+ unsigned int entry = tlb_44x_hwater--;
+
+ ppc44x_update_tlb_hwater();
+
+ mtspr(SPRN_MMUCR, 0);
+
__asm__ __volatile__(
"tlbwe %2,%3,%4\n"
"tlbwe %1,%3,%5\n"
@@ -50,26 +80,175 @@ static void __init ppc44x_pin_tlb(unsigned int virt, unsigned int phys)
: "r" (PPC44x_TLB_SW | PPC44x_TLB_SR | PPC44x_TLB_SX | PPC44x_TLB_G),
"r" (phys),
"r" (virt | PPC44x_TLB_VALID | PPC44x_TLB_256M),
- "r" (tlb_44x_hwater--), /* slot for this TLB entry */
+ "r" (entry),
"i" (PPC44x_TLB_PAGEID),
"i" (PPC44x_TLB_XLAT),
"i" (PPC44x_TLB_ATTRIB));
}
+static int __init ppc47x_find_free_bolted(void)
+{
+ unsigned int mmube0 = mfspr(SPRN_MMUBE0);
+ unsigned int mmube1 = mfspr(SPRN_MMUBE1);
+
+ if (!(mmube0 & MMUBE0_VBE0))
+ return 0;
+ if (!(mmube0 & MMUBE0_VBE1))
+ return 1;
+ if (!(mmube0 & MMUBE0_VBE2))
+ return 2;
+ if (!(mmube1 & MMUBE1_VBE3))
+ return 3;
+ if (!(mmube1 & MMUBE1_VBE4))
+ return 4;
+ if (!(mmube1 & MMUBE1_VBE5))
+ return 5;
+ return -1;
+}
+
+static void __init ppc47x_update_boltmap(void)
+{
+ unsigned int mmube0 = mfspr(SPRN_MMUBE0);
+ unsigned int mmube1 = mfspr(SPRN_MMUBE1);
+
+ if (mmube0 & MMUBE0_VBE0)
+ __set_bit((mmube0 >> MMUBE0_IBE0_SHIFT) & 0xff,
+ tlb_47x_boltmap);
+ if (mmube0 & MMUBE0_VBE1)
+ __set_bit((mmube0 >> MMUBE0_IBE1_SHIFT) & 0xff,
+ tlb_47x_boltmap);
+ if (mmube0 & MMUBE0_VBE2)
+ __set_bit((mmube0 >> MMUBE0_IBE2_SHIFT) & 0xff,
+ tlb_47x_boltmap);
+ if (mmube1 & MMUBE1_VBE3)
+ __set_bit((mmube1 >> MMUBE1_IBE3_SHIFT) & 0xff,
+ tlb_47x_boltmap);
+ if (mmube1 & MMUBE1_VBE4)
+ __set_bit((mmube1 >> MMUBE1_IBE4_SHIFT) & 0xff,
+ tlb_47x_boltmap);
+ if (mmube1 & MMUBE1_VBE5)
+ __set_bit((mmube1 >> MMUBE1_IBE5_SHIFT) & 0xff,
+ tlb_47x_boltmap);
+}
+
+/*
+ * "Pins" a 256MB TLB entry in AS0 for kernel lowmem for 47x type MMU
+ */
+static void ppc47x_pin_tlb(unsigned int virt, unsigned int phys)
+{
+ unsigned int rA;
+ int bolted;
+
+ /* Base rA is HW way select, way 0, bolted bit set */
+ rA = 0x88000000;
+
+ /* Look for a bolted entry slot */
+ bolted = ppc47x_find_free_bolted();
+ BUG_ON(bolted < 0);
+
+ /* Insert bolted slot number */
+ rA |= bolted << 24;
+
+ pr_debug("256M TLB entry for 0x%08x->0x%08x in bolt slot %d\n",
+ virt, phys, bolted);
+
+ mtspr(SPRN_MMUCR, 0);
+
+ __asm__ __volatile__(
+ "tlbwe %2,%3,0\n"
+ "tlbwe %1,%3,1\n"
+ "tlbwe %0,%3,2\n"
+ :
+ : "r" (PPC47x_TLB2_SW | PPC47x_TLB2_SR |
+ PPC47x_TLB2_SX
+#ifdef CONFIG_SMP
+ | PPC47x_TLB2_M
+#endif
+ ),
+ "r" (phys),
+ "r" (virt | PPC47x_TLB0_VALID | PPC47x_TLB0_256M),
+ "r" (rA));
+}
+
void __init MMU_init_hw(void)
{
+ /* This is not useful on 47x but won't hurt either */
+ ppc44x_update_tlb_hwater();
+
flush_instruction_cache();
}
-unsigned long __init mmu_mapin_ram(void)
+unsigned long __init mmu_mapin_ram(unsigned long top)
{
unsigned long addr;
+ unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1);
/* Pin in enough TLBs to cover any lowmem not covered by the
* initial 256M mapping established in head_44x.S */
- for (addr = PPC_PIN_SIZE; addr < total_lowmem;
- addr += PPC_PIN_SIZE)
- ppc44x_pin_tlb(addr + PAGE_OFFSET, addr);
+ for (addr = memstart + PPC_PIN_SIZE; addr < lowmem_end_addr;
+ addr += PPC_PIN_SIZE) {
+ if (mmu_has_feature(MMU_FTR_TYPE_47x))
+ ppc47x_pin_tlb(addr + PAGE_OFFSET, addr);
+ else
+ ppc44x_pin_tlb(addr + PAGE_OFFSET, addr);
+ }
+ if (mmu_has_feature(MMU_FTR_TYPE_47x)) {
+ ppc47x_update_boltmap();
+#ifdef DEBUG
+ {
+ int i;
+
+ printk(KERN_DEBUG "bolted entries: ");
+ for (i = 0; i < 255; i++) {
+ if (test_bit(i, tlb_47x_boltmap))
+ printk("%d ", i);
+ }
+ printk("\n");
+ }
+#endif /* DEBUG */
+ }
return total_lowmem;
}
+
+void setup_initial_memory_limit(phys_addr_t first_memblock_base,
+ phys_addr_t first_memblock_size)
+{
+ u64 size;
+
+#ifndef CONFIG_NONSTATIC_KERNEL
+ /* We don't currently support the first MEMBLOCK not mapping 0
+ * physical on those processors
+ */
+ BUG_ON(first_memblock_base != 0);
+#endif
+
+ /* 44x has a 256M TLB entry pinned at boot */
+ size = (min_t(u64, first_memblock_size, PPC_PIN_SIZE));
+ memblock_set_current_limit(first_memblock_base + size);
+}
+
+#ifdef CONFIG_SMP
+void mmu_init_secondary(int cpu)
+{
+ unsigned long addr;
+ unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1);
+
+ /* Pin in enough TLBs to cover any lowmem not covered by the
+ * initial 256M mapping established in head_44x.S
+ *
+ * WARNING: This is called with only the first 256M of the
+ * linear mapping in the TLB and we can't take faults yet
+ * so beware of what this code uses. It runs off a temporary
+ * stack. current (r2) isn't initialized, smp_processor_id()
+ * will not work, current thread info isn't accessible, ...
+ */
+ for (addr = memstart + PPC_PIN_SIZE; addr < lowmem_end_addr;
+ addr += PPC_PIN_SIZE) {
+ if (mmu_has_feature(MMU_FTR_TYPE_47x))
+ ppc47x_pin_tlb(addr + PAGE_OFFSET, addr);
+ else
+ ppc44x_pin_tlb(addr + PAGE_OFFSET, addr);
+ }
+}
+#endif /* CONFIG_SMP */
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 41649a5d360..51230ee6a40 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -2,24 +2,37 @@
# Makefile for the linux ppc-specific parts of the memory manager.
#
-ifeq ($(CONFIG_PPC64),y)
-EXTRA_CFLAGS += -mno-minimal-toc
-endif
+subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
+
+ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC)
-obj-y := fault.o mem.o lmb.o \
+obj-y := fault.o mem.o pgtable.o gup.o mmap.o \
init_$(CONFIG_WORD_SIZE).o \
- pgtable_$(CONFIG_WORD_SIZE).o \
- mmu_context_$(CONFIG_WORD_SIZE).o
-hash-$(CONFIG_PPC_NATIVE) := hash_native_64.o
-obj-$(CONFIG_PPC64) += hash_utils_64.o \
- slb_low.o slb.o stab.o mmap.o $(hash-y)
+ pgtable_$(CONFIG_WORD_SIZE).o
+obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \
+ tlb_nohash_low.o
+obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(CONFIG_WORD_SIZE)e.o
+hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o
+obj-$(CONFIG_PPC_STD_MMU_64) += hash_utils_64.o \
+ slb_low.o slb.o stab.o \
+ $(hash64-y)
obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o
obj-$(CONFIG_PPC_STD_MMU) += hash_low_$(CONFIG_WORD_SIZE).o \
- tlb_$(CONFIG_WORD_SIZE).o
+ tlb_hash$(CONFIG_WORD_SIZE).o \
+ mmu_context_hash$(CONFIG_WORD_SIZE).o
+obj-$(CONFIG_PPC_ICSWX) += icswx.o
+obj-$(CONFIG_PPC_ICSWX_PID) += icswx_pid.o
obj-$(CONFIG_40x) += 40x_mmu.o
obj-$(CONFIG_44x) += 44x_mmu.o
-obj-$(CONFIG_FSL_BOOKE) += fsl_booke_mmu.o
+obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_booke_mmu.o
obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
obj-$(CONFIG_PPC_MM_SLICES) += slice.o
-obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
+obj-y += hugetlbpage.o
+ifeq ($(CONFIG_HUGETLB_PAGE),y)
+obj-$(CONFIG_PPC_STD_MMU_64) += hugetlbpage-hash64.o
+obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o
+endif
+obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o
obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o
+obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
+obj-$(CONFIG_HIGHMEM) += highmem.o
diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c
new file mode 100644
index 00000000000..7b6c1075017
--- /dev/null
+++ b/arch/powerpc/mm/dma-noncoherent.c
@@ -0,0 +1,419 @@
+/*
+ * PowerPC version derived from arch/arm/mm/consistent.c
+ * Copyright (C) 2001 Dan Malek (dmalek@jlc.net)
+ *
+ * Copyright (C) 2000 Russell King
+ *
+ * Consistent memory allocators. Used for DMA devices that want to
+ * share uncached memory with the processor core. The function return
+ * is the virtual address and 'dma_handle' is the physical address.
+ * Mostly stolen from the ARM port, with some changes for PowerPC.
+ * -- Dan
+ *
+ * Reorganized to get rid of the arch-specific consistent_* functions
+ * and provide non-coherent implementations for the DMA API. -Matt
+ *
+ * Added in_interrupt() safe dma_alloc_coherent()/dma_free_coherent()
+ * implementation. This is pulled straight from ARM and barely
+ * modified. -Matt
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/highmem.h>
+#include <linux/dma-mapping.h>
+#include <linux/export.h>
+
+#include <asm/tlbflush.h>
+
+#include "mmu_decl.h"
+
+/*
+ * This address range defaults to a value that is safe for all
+ * platforms which currently set CONFIG_NOT_COHERENT_CACHE. It
+ * can be further configured for specific applications under
+ * the "Advanced Setup" menu. -Matt
+ */
+#define CONSISTENT_BASE (IOREMAP_TOP)
+#define CONSISTENT_END (CONSISTENT_BASE + CONFIG_CONSISTENT_SIZE)
+#define CONSISTENT_OFFSET(x) (((unsigned long)(x) - CONSISTENT_BASE) >> PAGE_SHIFT)
+
+/*
+ * This is the page table (2MB) covering uncached, DMA consistent allocations
+ */
+static DEFINE_SPINLOCK(consistent_lock);
+
+/*
+ * VM region handling support.
+ *
+ * This should become something generic, handling VM region allocations for
+ * vmalloc and similar (ioremap, module space, etc).
+ *
+ * I envisage vmalloc()'s supporting vm_struct becoming:
+ *
+ * struct vm_struct {
+ * struct vm_region region;
+ * unsigned long flags;
+ * struct page **pages;
+ * unsigned int nr_pages;
+ * unsigned long phys_addr;
+ * };
+ *
+ * get_vm_area() would then call vm_region_alloc with an appropriate
+ * struct vm_region head (eg):
+ *
+ * struct vm_region vmalloc_head = {
+ * .vm_list = LIST_HEAD_INIT(vmalloc_head.vm_list),
+ * .vm_start = VMALLOC_START,
+ * .vm_end = VMALLOC_END,
+ * };
+ *
+ * However, vmalloc_head.vm_start is variable (typically, it is dependent on
+ * the amount of RAM found at boot time.) I would imagine that get_vm_area()
+ * would have to initialise this each time prior to calling vm_region_alloc().
+ */
+struct ppc_vm_region {
+ struct list_head vm_list;
+ unsigned long vm_start;
+ unsigned long vm_end;
+};
+
+static struct ppc_vm_region consistent_head = {
+ .vm_list = LIST_HEAD_INIT(consistent_head.vm_list),
+ .vm_start = CONSISTENT_BASE,
+ .vm_end = CONSISTENT_END,
+};
+
+static struct ppc_vm_region *
+ppc_vm_region_alloc(struct ppc_vm_region *head, size_t size, gfp_t gfp)
+{
+ unsigned long addr = head->vm_start, end = head->vm_end - size;
+ unsigned long flags;
+ struct ppc_vm_region *c, *new;
+
+ new = kmalloc(sizeof(struct ppc_vm_region), gfp);
+ if (!new)
+ goto out;
+
+ spin_lock_irqsave(&consistent_lock, flags);
+
+ list_for_each_entry(c, &head->vm_list, vm_list) {
+ if ((addr + size) < addr)
+ goto nospc;
+ if ((addr + size) <= c->vm_start)
+ goto found;
+ addr = c->vm_end;
+ if (addr > end)
+ goto nospc;
+ }
+
+ found:
+ /*
+ * Insert this entry _before_ the one we found.
+ */
+ list_add_tail(&new->vm_list, &c->vm_list);
+ new->vm_start = addr;
+ new->vm_end = addr + size;
+
+ spin_unlock_irqrestore(&consistent_lock, flags);
+ return new;
+
+ nospc:
+ spin_unlock_irqrestore(&consistent_lock, flags);
+ kfree(new);
+ out:
+ return NULL;
+}
+
+static struct ppc_vm_region *ppc_vm_region_find(struct ppc_vm_region *head, unsigned long addr)
+{
+ struct ppc_vm_region *c;
+
+ list_for_each_entry(c, &head->vm_list, vm_list) {
+ if (c->vm_start == addr)
+ goto out;
+ }
+ c = NULL;
+ out:
+ return c;
+}
+
+/*
+ * Allocate DMA-coherent memory space and return both the kernel remapped
+ * virtual and bus address for that space.
+ */
+void *
+__dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp)
+{
+ struct page *page;
+ struct ppc_vm_region *c;
+ unsigned long order;
+ u64 mask = ISA_DMA_THRESHOLD, limit;
+
+ if (dev) {
+ mask = dev->coherent_dma_mask;
+
+ /*
+ * Sanity check the DMA mask - it must be non-zero, and
+ * must be able to be satisfied by a DMA allocation.
+ */
+ if (mask == 0) {
+ dev_warn(dev, "coherent DMA mask is unset\n");
+ goto no_page;
+ }
+
+ if ((~mask) & ISA_DMA_THRESHOLD) {
+ dev_warn(dev, "coherent DMA mask %#llx is smaller "
+ "than system GFP_DMA mask %#llx\n",
+ mask, (unsigned long long)ISA_DMA_THRESHOLD);
+ goto no_page;
+ }
+ }
+
+
+ size = PAGE_ALIGN(size);
+ limit = (mask + 1) & ~mask;
+ if ((limit && size >= limit) ||
+ size >= (CONSISTENT_END - CONSISTENT_BASE)) {
+ printk(KERN_WARNING "coherent allocation too big (requested %#x mask %#Lx)\n",
+ size, mask);
+ return NULL;
+ }
+
+ order = get_order(size);
+
+ /* Might be useful if we ever have a real legacy DMA zone... */
+ if (mask != 0xffffffff)
+ gfp |= GFP_DMA;
+
+ page = alloc_pages(gfp, order);
+ if (!page)
+ goto no_page;
+
+ /*
+ * Invalidate any data that might be lurking in the
+ * kernel direct-mapped region for device DMA.
+ */
+ {
+ unsigned long kaddr = (unsigned long)page_address(page);
+ memset(page_address(page), 0, size);
+ flush_dcache_range(kaddr, kaddr + size);
+ }
+
+ /*
+ * Allocate a virtual address in the consistent mapping region.
+ */
+ c = ppc_vm_region_alloc(&consistent_head, size,
+ gfp & ~(__GFP_DMA | __GFP_HIGHMEM));
+ if (c) {
+ unsigned long vaddr = c->vm_start;
+ struct page *end = page + (1 << order);
+
+ split_page(page, order);
+
+ /*
+ * Set the "dma handle"
+ */
+ *handle = page_to_phys(page);
+
+ do {
+ SetPageReserved(page);
+ map_page(vaddr, page_to_phys(page),
+ pgprot_noncached(PAGE_KERNEL));
+ page++;
+ vaddr += PAGE_SIZE;
+ } while (size -= PAGE_SIZE);
+
+ /*
+ * Free the otherwise unused pages.
+ */
+ while (page < end) {
+ __free_page(page);
+ page++;
+ }
+
+ return (void *)c->vm_start;
+ }
+
+ if (page)
+ __free_pages(page, order);
+ no_page:
+ return NULL;
+}
+EXPORT_SYMBOL(__dma_alloc_coherent);
+
+/*
+ * free a page as defined by the above mapping.
+ */
+void __dma_free_coherent(size_t size, void *vaddr)
+{
+ struct ppc_vm_region *c;
+ unsigned long flags, addr;
+
+ size = PAGE_ALIGN(size);
+
+ spin_lock_irqsave(&consistent_lock, flags);
+
+ c = ppc_vm_region_find(&consistent_head, (unsigned long)vaddr);
+ if (!c)
+ goto no_area;
+
+ if ((c->vm_end - c->vm_start) != size) {
+ printk(KERN_ERR "%s: freeing wrong coherent size (%ld != %d)\n",
+ __func__, c->vm_end - c->vm_start, size);
+ dump_stack();
+ size = c->vm_end - c->vm_start;
+ }
+
+ addr = c->vm_start;
+ do {
+ pte_t *ptep;
+ unsigned long pfn;
+
+ ptep = pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(addr),
+ addr),
+ addr),
+ addr);
+ if (!pte_none(*ptep) && pte_present(*ptep)) {
+ pfn = pte_pfn(*ptep);
+ pte_clear(&init_mm, addr, ptep);
+ if (pfn_valid(pfn)) {
+ struct page *page = pfn_to_page(pfn);
+ __free_reserved_page(page);
+ }
+ }
+ addr += PAGE_SIZE;
+ } while (size -= PAGE_SIZE);
+
+ flush_tlb_kernel_range(c->vm_start, c->vm_end);
+
+ list_del(&c->vm_list);
+
+ spin_unlock_irqrestore(&consistent_lock, flags);
+
+ kfree(c);
+ return;
+
+ no_area:
+ spin_unlock_irqrestore(&consistent_lock, flags);
+ printk(KERN_ERR "%s: trying to free invalid coherent area: %p\n",
+ __func__, vaddr);
+ dump_stack();
+}
+EXPORT_SYMBOL(__dma_free_coherent);
+
+/*
+ * make an area consistent.
+ */
+void __dma_sync(void *vaddr, size_t size, int direction)
+{
+ unsigned long start = (unsigned long)vaddr;
+ unsigned long end = start + size;
+
+ switch (direction) {
+ case DMA_NONE:
+ BUG();
+ case DMA_FROM_DEVICE:
+ /*
+ * invalidate only when cache-line aligned otherwise there is
+ * the potential for discarding uncommitted data from the cache
+ */
+ if ((start & (L1_CACHE_BYTES - 1)) || (size & (L1_CACHE_BYTES - 1)))
+ flush_dcache_range(start, end);
+ else
+ invalidate_dcache_range(start, end);
+ break;
+ case DMA_TO_DEVICE: /* writeback only */
+ clean_dcache_range(start, end);
+ break;
+ case DMA_BIDIRECTIONAL: /* writeback and invalidate */
+ flush_dcache_range(start, end);
+ break;
+ }
+}
+EXPORT_SYMBOL(__dma_sync);
+
+#ifdef CONFIG_HIGHMEM
+/*
+ * __dma_sync_page() implementation for systems using highmem.
+ * In this case, each page of a buffer must be kmapped/kunmapped
+ * in order to have a virtual address for __dma_sync(). This must
+ * not sleep so kmap_atomic()/kunmap_atomic() are used.
+ *
+ * Note: yes, it is possible and correct to have a buffer extend
+ * beyond the first page.
+ */
+static inline void __dma_sync_page_highmem(struct page *page,
+ unsigned long offset, size_t size, int direction)
+{
+ size_t seg_size = min((size_t)(PAGE_SIZE - offset), size);
+ size_t cur_size = seg_size;
+ unsigned long flags, start, seg_offset = offset;
+ int nr_segs = 1 + ((size - seg_size) + PAGE_SIZE - 1)/PAGE_SIZE;
+ int seg_nr = 0;
+
+ local_irq_save(flags);
+
+ do {
+ start = (unsigned long)kmap_atomic(page + seg_nr) + seg_offset;
+
+ /* Sync this buffer segment */
+ __dma_sync((void *)start, seg_size, direction);
+ kunmap_atomic((void *)start);
+ seg_nr++;
+
+ /* Calculate next buffer segment size */
+ seg_size = min((size_t)PAGE_SIZE, size - cur_size);
+
+ /* Add the segment size to our running total */
+ cur_size += seg_size;
+ seg_offset = 0;
+ } while (seg_nr < nr_segs);
+
+ local_irq_restore(flags);
+}
+#endif /* CONFIG_HIGHMEM */
+
+/*
+ * __dma_sync_page makes memory consistent. identical to __dma_sync, but
+ * takes a struct page instead of a virtual address
+ */
+void __dma_sync_page(struct page *page, unsigned long offset,
+ size_t size, int direction)
+{
+#ifdef CONFIG_HIGHMEM
+ __dma_sync_page_highmem(page, offset, size, direction);
+#else
+ unsigned long start = (unsigned long)page_address(page) + offset;
+ __dma_sync((void *)start, size, direction);
+#endif
+}
+EXPORT_SYMBOL(__dma_sync_page);
+
+/*
+ * Return the PFN for a given cpu virtual address returned by
+ * __dma_alloc_coherent. This is used by dma_mmap_coherent()
+ */
+unsigned long __dma_get_coherent_pfn(unsigned long cpu_addr)
+{
+ /* This should always be populated, so we don't test every
+ * level. If that fails, we'll have a nice crash which
+ * will be as good as a BUG_ON()
+ */
+ pgd_t *pgd = pgd_offset_k(cpu_addr);
+ pud_t *pud = pud_offset(pgd, cpu_addr);
+ pmd_t *pmd = pmd_offset(pud, cpu_addr);
+ pte_t *ptep = pte_offset_kernel(pmd, cpu_addr);
+
+ if (pte_none(*ptep) || !pte_present(*ptep))
+ return 0;
+ return pte_pfn(*ptep);
+}
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 7b251079926..51ab9e7e6c3 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -29,16 +29,23 @@
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
+#include <linux/perf_event.h>
+#include <linux/magic.h>
+#include <linux/ratelimit.h>
+#include <linux/context_tracking.h>
+#include <asm/firmware.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/mmu_context.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/tlbflush.h>
#include <asm/siginfo.h>
+#include <asm/debug.h>
+#include <mm/mmu_decl.h>
+#include "icswx.h"
#ifdef CONFIG_KPROBES
static inline int notify_page_fault(struct pt_regs *regs)
@@ -99,31 +106,80 @@ static int store_updates_sp(struct pt_regs *regs)
}
return 0;
}
+/*
+ * do_page_fault error handling helpers
+ */
-#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
-static void do_dabr(struct pt_regs *regs, unsigned long address,
- unsigned long error_code)
+#define MM_FAULT_RETURN 0
+#define MM_FAULT_CONTINUE -1
+#define MM_FAULT_ERR(sig) (sig)
+
+static int do_sigbus(struct pt_regs *regs, unsigned long address)
{
siginfo_t info;
- if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code,
- 11, SIGSEGV) == NOTIFY_STOP)
- return;
+ up_read(&current->mm->mmap_sem);
- if (debugger_dabr_match(regs))
- return;
+ if (user_mode(regs)) {
+ current->thread.trap_nr = BUS_ADRERR;
+ info.si_signo = SIGBUS;
+ info.si_errno = 0;
+ info.si_code = BUS_ADRERR;
+ info.si_addr = (void __user *)address;
+ force_sig_info(SIGBUS, &info, current);
+ return MM_FAULT_RETURN;
+ }
+ return MM_FAULT_ERR(SIGBUS);
+}
+
+static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
+{
+ /*
+ * Pagefault was interrupted by SIGKILL. We have no reason to
+ * continue the pagefault.
+ */
+ if (fatal_signal_pending(current)) {
+ /*
+ * If we have retry set, the mmap semaphore will have
+ * alrady been released in __lock_page_or_retry(). Else
+ * we release it now.
+ */
+ if (!(fault & VM_FAULT_RETRY))
+ up_read(&current->mm->mmap_sem);
+ /* Coming from kernel, we need to deal with uaccess fixups */
+ if (user_mode(regs))
+ return MM_FAULT_RETURN;
+ return MM_FAULT_ERR(SIGKILL);
+ }
+
+ /* No fault: be happy */
+ if (!(fault & VM_FAULT_ERROR))
+ return MM_FAULT_CONTINUE;
+
+ /* Out of memory */
+ if (fault & VM_FAULT_OOM) {
+ up_read(&current->mm->mmap_sem);
- /* Clear the DABR */
- set_dabr(0);
+ /*
+ * We ran out of memory, or some other thing happened to us that
+ * made us unable to handle the page fault gracefully.
+ */
+ if (!user_mode(regs))
+ return MM_FAULT_ERR(SIGKILL);
+ pagefault_out_of_memory();
+ return MM_FAULT_RETURN;
+ }
+
+ /* Bus error. x86 handles HWPOISON here, we'll add this if/when
+ * we support the feature in HW
+ */
+ if (fault & VM_FAULT_SIGBUS)
+ return do_sigbus(regs, addr);
- /* Deliver the signal to userspace */
- info.si_signo = SIGTRAP;
- info.si_errno = 0;
- info.si_code = TRAP_HWBKPT;
- info.si_addr = (void __user *)address;
- force_sig_info(SIGTRAP, &info, current);
+ /* We don't understand the fault code, this is fatal */
+ BUG();
+ return MM_FAULT_CONTINUE;
}
-#endif /* !(CONFIG_4xx || CONFIG_BOOKE)*/
/*
* For 600- and 800-family processors, the error_code parameter is DSISR
@@ -141,13 +197,16 @@ static void do_dabr(struct pt_regs *regs, unsigned long address,
int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
unsigned long error_code)
{
+ enum ctx_state prev_state = exception_enter();
struct vm_area_struct * vma;
struct mm_struct *mm = current->mm;
- siginfo_t info;
+ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
int code = SEGV_MAPERR;
- int is_write = 0, ret;
+ int is_write = 0;
int trap = TRAP(regs);
int is_exec = trap == 0x400;
+ int fault;
+ int rc = 0, store_update_sp = 0;
#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
/*
@@ -164,27 +223,49 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
is_write = error_code & ESR_DST;
#endif /* CONFIG_4xx || CONFIG_BOOKE */
+#ifdef CONFIG_PPC_ICSWX
+ /*
+ * we need to do this early because this "data storage
+ * interrupt" does not update the DAR/DEAR so we don't want to
+ * look at it
+ */
+ if (error_code & ICSWX_DSI_UCT) {
+ rc = acop_handle_fault(regs, address, error_code);
+ if (rc)
+ goto bail;
+ }
+#endif /* CONFIG_PPC_ICSWX */
+
if (notify_page_fault(regs))
- return 0;
+ goto bail;
if (unlikely(debugger_fault_handler(regs)))
- return 0;
+ goto bail;
/* On a kernel SLB miss we can only check for a valid exception entry */
- if (!user_mode(regs) && (address >= TASK_SIZE))
- return SIGSEGV;
+ if (!user_mode(regs) && (address >= TASK_SIZE)) {
+ rc = SIGSEGV;
+ goto bail;
+ }
-#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
+#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE) || \
+ defined(CONFIG_PPC_BOOK3S_64))
if (error_code & DSISR_DABRMATCH) {
- /* DABR match */
- do_dabr(regs, address, error_code);
- return 0;
+ /* breakpoint match */
+ do_break(regs, address, error_code);
+ goto bail;
}
-#endif /* !(CONFIG_4xx || CONFIG_BOOKE)*/
+#endif
+
+ /* We restore the interrupt state now */
+ if (!arch_irq_disabled_regs(regs))
+ local_irq_enable();
if (in_atomic() || mm == NULL) {
- if (!user_mode(regs))
- return SIGSEGV;
+ if (!user_mode(regs)) {
+ rc = SIGSEGV;
+ goto bail;
+ }
/* in_atomic() in user mode is really bad,
as is current->mm == NULL. */
printk(KERN_EMERG "Page fault in user mode with "
@@ -194,6 +275,19 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
die("Weird page fault", regs, SIGSEGV);
}
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+
+ /*
+ * We want to do this outside mmap_sem, because reading code around nip
+ * can result in fault, which will cause a deadlock when called with
+ * mmap_sem held
+ */
+ if (user_mode(regs))
+ store_update_sp = store_updates_sp(regs);
+
+ if (user_mode(regs))
+ flags |= FAULT_FLAG_USER;
+
/* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in the
* kernel and should generate an OOPS. Unfortunately, in the case of an
@@ -213,7 +307,15 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
if (!user_mode(regs) && !search_exception_tables(regs->nip))
goto bad_area_nosemaphore;
+retry:
down_read(&mm->mmap_sem);
+ } else {
+ /*
+ * The above down_read_trylock() might have succeeded in
+ * which case we'll have missed the might_sleep() from
+ * down_read():
+ */
+ might_sleep();
}
vma = find_vma(mm, address);
@@ -251,8 +353,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
* between the last mapped region and the stack will
* expand the stack rather than segfaulting.
*/
- if (address + 2048 < uregs->gpr[1]
- && (!user_mode(regs) || !store_updates_sp(regs)))
+ if (address + 2048 < uregs->gpr[1] && !store_update_sp)
goto bad_area;
}
if (expand_stack(vma, address))
@@ -267,6 +368,12 @@ good_area:
goto bad_area;
#endif /* CONFIG_6xx */
#if defined(CONFIG_8xx)
+ /* 8xx sometimes need to load a invalid/non-present TLBs.
+ * These must be invalidated separately as linux mm don't.
+ */
+ if (error_code & 0x40000000) /* no translation? */
+ _tlbil_va(address, 0, 0, 0);
+
/* The MPC8xx seems to always set 0x80000000, which is
* "undefined". Of those that can be set, this is the only
* one which seems bad.
@@ -277,48 +384,38 @@ good_area:
#endif /* CONFIG_8xx */
if (is_exec) {
-#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
- /* protection fault */
+#ifdef CONFIG_PPC_STD_MMU
+ /* Protection fault on exec go straight to failure on
+ * Hash based MMUs as they either don't support per-page
+ * execute permission, or if they do, it's handled already
+ * at the hash level. This test would probably have to
+ * be removed if we change the way this works to make hash
+ * processors use the same I/D cache coherency mechanism
+ * as embedded.
+ */
if (error_code & DSISR_PROTFAULT)
goto bad_area;
+#endif /* CONFIG_PPC_STD_MMU */
+
/*
* Allow execution from readable areas if the MMU does not
* provide separate controls over reading and executing.
+ *
+ * Note: That code used to not be enabled for 4xx/BookE.
+ * It is now as I/D cache coherency for these is done at
+ * set_pte_at() time and I see no reason why the test
+ * below wouldn't be valid on those processors. This -may-
+ * break programs compiled with a really old ABI though.
*/
if (!(vma->vm_flags & VM_EXEC) &&
(cpu_has_feature(CPU_FTR_NOEXECUTE) ||
!(vma->vm_flags & (VM_READ | VM_WRITE))))
goto bad_area;
-#else
- pte_t *ptep;
- pmd_t *pmdp;
-
- /* Since 4xx/Book-E supports per-page execute permission,
- * we lazily flush dcache to icache. */
- ptep = NULL;
- if (get_pteptr(mm, address, &ptep, &pmdp)) {
- spinlock_t *ptl = pte_lockptr(mm, pmdp);
- spin_lock(ptl);
- if (pte_present(*ptep)) {
- struct page *page = pte_page(*ptep);
-
- if (!test_bit(PG_arch_1, &page->flags)) {
- flush_dcache_icache_page(page);
- set_bit(PG_arch_1, &page->flags);
- }
- pte_update(ptep, 0, _PAGE_HWEXEC);
- _tlbie(address, mm->context.id);
- pte_unmap_unlock(ptep, ptl);
- up_read(&mm->mmap_sem);
- return 0;
- }
- pte_unmap_unlock(ptep, ptl);
- }
-#endif
/* a write */
} else if (is_write) {
if (!(vma->vm_flags & VM_WRITE))
goto bad_area;
+ flags |= FAULT_FLAG_WRITE;
/* a read */
} else {
/* protection fault */
@@ -333,21 +430,52 @@ good_area:
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
- survive:
- ret = handle_mm_fault(mm, vma, address, is_write);
- if (unlikely(ret & VM_FAULT_ERROR)) {
- if (ret & VM_FAULT_OOM)
- goto out_of_memory;
- else if (ret & VM_FAULT_SIGBUS)
- goto do_sigbus;
- BUG();
+ fault = handle_mm_fault(mm, vma, address, flags);
+ if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
+ rc = mm_fault_error(regs, address, fault);
+ if (rc >= MM_FAULT_RETURN)
+ goto bail;
+ else
+ rc = 0;
}
- if (ret & VM_FAULT_MAJOR)
- current->maj_flt++;
- else
- current->min_flt++;
+
+ /*
+ * Major/minor page fault accounting is only done on the
+ * initial attempt. If we go through a retry, it is extremely
+ * likely that the page will be found in page cache at that point.
+ */
+ if (flags & FAULT_FLAG_ALLOW_RETRY) {
+ if (fault & VM_FAULT_MAJOR) {
+ current->maj_flt++;
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
+ regs, address);
+#ifdef CONFIG_PPC_SMLPAR
+ if (firmware_has_feature(FW_FEATURE_CMO)) {
+ u32 page_ins;
+
+ preempt_disable();
+ page_ins = be32_to_cpu(get_lppaca()->page_ins);
+ page_ins += 1 << PAGE_FACTOR;
+ get_lppaca()->page_ins = cpu_to_be32(page_ins);
+ preempt_enable();
+ }
+#endif /* CONFIG_PPC_SMLPAR */
+ } else {
+ current->min_flt++;
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
+ regs, address);
+ }
+ if (fault & VM_FAULT_RETRY) {
+ /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
+ * of starvation. */
+ flags &= ~FAULT_FLAG_ALLOW_RETRY;
+ flags |= FAULT_FLAG_TRIED;
+ goto retry;
+ }
+ }
+
up_read(&mm->mmap_sem);
- return 0;
+ goto bail;
bad_area:
up_read(&mm->mmap_sem);
@@ -356,44 +484,20 @@ bad_area_nosemaphore:
/* User mode accesses cause a SIGSEGV */
if (user_mode(regs)) {
_exception(SIGSEGV, regs, code, address);
- return 0;
+ goto bail;
}
- if (is_exec && (error_code & DSISR_PROTFAULT)
- && printk_ratelimit())
- printk(KERN_CRIT "kernel tried to execute NX-protected"
- " page (%lx) - exploit attempt? (uid: %d)\n",
- address, current->uid);
+ if (is_exec && (error_code & DSISR_PROTFAULT))
+ printk_ratelimited(KERN_CRIT "kernel tried to execute NX-protected"
+ " page (%lx) - exploit attempt? (uid: %d)\n",
+ address, from_kuid(&init_user_ns, current_uid()));
- return SIGSEGV;
+ rc = SIGSEGV;
-/*
- * We ran out of memory, or some other thing happened to us that made
- * us unable to handle the page fault gracefully.
- */
-out_of_memory:
- up_read(&mm->mmap_sem);
- if (is_global_init(current)) {
- yield();
- down_read(&mm->mmap_sem);
- goto survive;
- }
- printk("VM: killing process %s\n", current->comm);
- if (user_mode(regs))
- do_group_exit(SIGKILL);
- return SIGKILL;
+bail:
+ exception_exit(prev_state);
+ return rc;
-do_sigbus:
- up_read(&mm->mmap_sem);
- if (user_mode(regs)) {
- info.si_signo = SIGBUS;
- info.si_errno = 0;
- info.si_code = BUS_ADRERR;
- info.si_addr = (void __user *)address;
- force_sig_info(SIGBUS, &info, current);
- return 0;
- }
- return SIGBUS;
}
/*
@@ -404,6 +508,7 @@ do_sigbus:
void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
{
const struct exception_table_entry *entry;
+ unsigned long *stackend;
/* Are we prepared to handle this fault? */
if ((entry = search_exception_tables(regs->nip)) != NULL) {
@@ -432,5 +537,9 @@ void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n",
regs->nip);
+ stackend = end_of_stack(current);
+ if (current != &init_task && *stackend != STACK_END_MAGIC)
+ printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
+
die("Kernel access of bad area", regs, sig);
}
diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c
index c93a966b7e4..94cd728166d 100644
--- a/arch/powerpc/mm/fsl_booke_mmu.c
+++ b/arch/powerpc/mm/fsl_booke_mmu.c
@@ -2,7 +2,7 @@
* Modifications by Kumar Gala (galak@kernel.crashing.org) to support
* E500 Book E processors.
*
- * Copyright 2004 Freescale Semiconductor, Inc
+ * Copyright 2004,2010 Freescale Semiconductor, Inc.
*
* This file contains the routines for initializing the MMU
* on the 4xx series of chips.
@@ -40,6 +40,7 @@
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/highmem.h>
+#include <linux/memblock.h>
#include <asm/pgalloc.h>
#include <asm/prom.h>
@@ -49,41 +50,34 @@
#include <asm/mmu.h>
#include <asm/uaccess.h>
#include <asm/smp.h>
-#include <asm/bootx.h>
#include <asm/machdep.h>
#include <asm/setup.h>
+#include <asm/paca.h>
+
+#include "mmu_decl.h"
-extern void loadcam_entry(unsigned int index);
unsigned int tlbcam_index;
-unsigned int num_tlbcam_entries;
-static unsigned long __cam0, __cam1, __cam2;
-extern unsigned long total_lowmem;
-extern unsigned long __max_low_memory;
-extern unsigned long __initial_memory_limit;
-#define MAX_LOW_MEM CONFIG_LOWMEM_SIZE
-
-#define NUM_TLBCAMS (16)
-
-struct tlbcam {
- u32 MAS0;
- u32 MAS1;
- u32 MAS2;
- u32 MAS3;
- u32 MAS7;
-} TLBCAM[NUM_TLBCAMS];
+
+#define NUM_TLBCAMS (64)
+struct tlbcam TLBCAM[NUM_TLBCAMS];
struct tlbcamrange {
- unsigned long start;
+ unsigned long start;
unsigned long limit;
phys_addr_t phys;
} tlbcam_addrs[NUM_TLBCAMS];
extern unsigned int tlbcam_index;
+unsigned long tlbcam_sz(int idx)
+{
+ return tlbcam_addrs[idx].limit - tlbcam_addrs[idx].start + 1;
+}
+
/*
* Return PA for this VA if it is mapped by a CAM, or 0
*/
-unsigned long v_mapped_by_tlbcam(unsigned long va)
+phys_addr_t v_mapped_by_tlbcam(unsigned long va)
{
int b;
for (b = 0; b < tlbcam_index; ++b)
@@ -95,29 +89,30 @@ unsigned long v_mapped_by_tlbcam(unsigned long va)
/*
* Return VA for a given PA or 0 if not mapped
*/
-unsigned long p_mapped_by_tlbcam(unsigned long pa)
+unsigned long p_mapped_by_tlbcam(phys_addr_t pa)
{
int b;
for (b = 0; b < tlbcam_index; ++b)
if (pa >= tlbcam_addrs[b].phys
- && pa < (tlbcam_addrs[b].limit-tlbcam_addrs[b].start)
+ && pa < (tlbcam_addrs[b].limit-tlbcam_addrs[b].start)
+tlbcam_addrs[b].phys)
return tlbcam_addrs[b].start+(pa-tlbcam_addrs[b].phys);
return 0;
}
/*
- * Set up one of the I/D BAT (block address translation) register pairs.
- * The parameters are not checked; in particular size must be a power
- * of 4 between 4k and 256M.
+ * Set up a variable-size TLB entry (tlbcam). The parameters are not checked;
+ * in particular size must be a power of 4 between 4k and the max supported by
+ * an implementation; max may further be limited by what can be represented in
+ * an unsigned long (for example, 32-bit implementations cannot support a 4GB
+ * size).
*/
-void settlbcam(int index, unsigned long virt, phys_addr_t phys,
- unsigned int size, int flags, unsigned int pid)
+static void settlbcam(int index, unsigned long virt, phys_addr_t phys,
+ unsigned long size, unsigned long flags, unsigned int pid)
{
- unsigned int tsize, lz;
+ unsigned int tsize;
- asm ("cntlzw %0,%1" : "=r" (lz) : "r" (size));
- tsize = (21 - lz) / 2;
+ tsize = __ilog2(size) - 10;
#ifdef CONFIG_SMP
if ((flags & _PAGE_NO_CACHE) == 0)
@@ -134,18 +129,16 @@ void settlbcam(int index, unsigned long virt, phys_addr_t phys,
TLBCAM[index].MAS2 |= (flags & _PAGE_GUARDED) ? MAS2_G : 0;
TLBCAM[index].MAS2 |= (flags & _PAGE_ENDIAN) ? MAS2_E : 0;
- TLBCAM[index].MAS3 = (phys & PAGE_MASK) | MAS3_SX | MAS3_SR;
+ TLBCAM[index].MAS3 = (phys & MAS3_RPN) | MAS3_SX | MAS3_SR;
TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_SW : 0);
+ if (mmu_has_feature(MMU_FTR_BIG_PHYS))
+ TLBCAM[index].MAS7 = (u64)phys >> 32;
-#ifndef CONFIG_KGDB /* want user access for breakpoints */
- if (flags & _PAGE_USER) {
+ /* Below is unlikely -- only for large user pages or similar */
+ if (pte_user(flags)) {
TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR;
TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0);
}
-#else
- TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR;
- TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0);
-#endif
tlbcam_addrs[index].start = virt;
tlbcam_addrs[index].limit = virt + size - 1;
@@ -154,27 +147,77 @@ void settlbcam(int index, unsigned long virt, phys_addr_t phys,
loadcam_entry(index);
}
-void invalidate_tlbcam_entry(int index)
+unsigned long calc_cam_sz(unsigned long ram, unsigned long virt,
+ phys_addr_t phys)
{
- TLBCAM[index].MAS0 = MAS0_TLBSEL(1) | MAS0_ESEL(index);
- TLBCAM[index].MAS1 = ~MAS1_VALID;
+ unsigned int camsize = __ilog2(ram);
+ unsigned int align = __ffs(virt | phys);
+ unsigned long max_cam;
- loadcam_entry(index);
+ if ((mfspr(SPRN_MMUCFG) & MMUCFG_MAVN) == MMUCFG_MAVN_V1) {
+ /* Convert (4^max) kB to (2^max) bytes */
+ max_cam = ((mfspr(SPRN_TLB1CFG) >> 16) & 0xf) * 2 + 10;
+ camsize &= ~1U;
+ align &= ~1U;
+ } else {
+ /* Convert (2^max) kB to (2^max) bytes */
+ max_cam = __ilog2(mfspr(SPRN_TLB1PS)) + 10;
+ }
+
+ if (camsize > align)
+ camsize = align;
+ if (camsize > max_cam)
+ camsize = max_cam;
+
+ return 1UL << camsize;
}
-void __init cam_mapin_ram(unsigned long cam0, unsigned long cam1,
- unsigned long cam2)
+static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt,
+ unsigned long ram, int max_cam_idx)
{
- settlbcam(0, PAGE_OFFSET, PPC_MEMSTART, cam0, _PAGE_KERNEL, 0);
- tlbcam_index++;
- if (cam1) {
- tlbcam_index++;
- settlbcam(1, PAGE_OFFSET+cam0, PPC_MEMSTART+cam0, cam1, _PAGE_KERNEL, 0);
- }
- if (cam2) {
- tlbcam_index++;
- settlbcam(2, PAGE_OFFSET+cam0+cam1, PPC_MEMSTART+cam0+cam1, cam2, _PAGE_KERNEL, 0);
+ int i;
+ unsigned long amount_mapped = 0;
+
+ /* Calculate CAM values */
+ for (i = 0; ram && i < max_cam_idx; i++) {
+ unsigned long cam_sz;
+
+ cam_sz = calc_cam_sz(ram, virt, phys);
+ settlbcam(i, virt, phys, cam_sz, PAGE_KERNEL_X, 0);
+
+ ram -= cam_sz;
+ amount_mapped += cam_sz;
+ virt += cam_sz;
+ phys += cam_sz;
}
+ tlbcam_index = i;
+
+#ifdef CONFIG_PPC64
+ get_paca()->tcd.esel_next = i;
+ get_paca()->tcd.esel_max = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY;
+ get_paca()->tcd.esel_first = i;
+#endif
+
+ return amount_mapped;
+}
+
+unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx)
+{
+ unsigned long virt = PAGE_OFFSET;
+ phys_addr_t phys = memstart_addr;
+
+ return map_mem_in_cams_addr(phys, virt, ram, max_cam_idx);
+}
+
+#ifdef CONFIG_PPC32
+
+#if defined(CONFIG_LOWMEM_CAM_NUM_BOOL) && (CONFIG_LOWMEM_CAM_NUM >= NUM_TLBCAMS)
+#error "LOWMEM_CAM_NUM must be less than NUM_TLBCAMS"
+#endif
+
+unsigned long __init mmu_mapin_ram(unsigned long top)
+{
+ return tlbcam_addrs[tlbcam_index - 1].limit - PAGE_OFFSET + 1;
}
/*
@@ -185,53 +228,91 @@ void __init MMU_init_hw(void)
flush_instruction_cache();
}
-unsigned long __init mmu_mapin_ram(void)
+void __init adjust_total_lowmem(void)
{
- cam_mapin_ram(__cam0, __cam1, __cam2);
+ unsigned long ram;
+ int i;
- return __cam0 + __cam1 + __cam2;
-}
+ /* adjust lowmem size to __max_low_memory */
+ ram = min((phys_addr_t)__max_low_memory, (phys_addr_t)total_lowmem);
+ i = switch_to_as1();
+ __max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM);
+ restore_to_as0(i, 0, 0, 1);
-void __init
-adjust_total_lowmem(void)
+ pr_info("Memory CAM mapping: ");
+ for (i = 0; i < tlbcam_index - 1; i++)
+ pr_cont("%lu/", tlbcam_sz(i) >> 20);
+ pr_cont("%lu Mb, residual: %dMb\n", tlbcam_sz(tlbcam_index - 1) >> 20,
+ (unsigned int)((total_lowmem - __max_low_memory) >> 20));
+
+ memblock_set_current_limit(memstart_addr + __max_low_memory);
+}
+
+void setup_initial_memory_limit(phys_addr_t first_memblock_base,
+ phys_addr_t first_memblock_size)
{
- unsigned long max_low_mem = MAX_LOW_MEM;
- unsigned long cam_max = 0x10000000;
- unsigned long ram;
+ phys_addr_t limit = first_memblock_base + first_memblock_size;
- /* adjust CAM size to max_low_mem */
- if (max_low_mem < cam_max)
- cam_max = max_low_mem;
+ /* 64M mapped initially according to head_fsl_booke.S */
+ memblock_set_current_limit(min_t(u64, limit, 0x04000000));
+}
- /* adjust lowmem size to max_low_mem */
- if (max_low_mem < total_lowmem)
- ram = max_low_mem;
- else
- ram = total_lowmem;
+#ifdef CONFIG_RELOCATABLE
+int __initdata is_second_reloc;
+notrace void __init relocate_init(u64 dt_ptr, phys_addr_t start)
+{
+ unsigned long base = KERNELBASE;
- /* Calculate CAM values */
- __cam0 = 1UL << 2 * (__ilog2(ram) / 2);
- if (__cam0 > cam_max)
- __cam0 = cam_max;
- ram -= __cam0;
- if (ram) {
- __cam1 = 1UL << 2 * (__ilog2(ram) / 2);
- if (__cam1 > cam_max)
- __cam1 = cam_max;
- ram -= __cam1;
- }
- if (ram) {
- __cam2 = 1UL << 2 * (__ilog2(ram) / 2);
- if (__cam2 > cam_max)
- __cam2 = cam_max;
- ram -= __cam2;
+ kernstart_addr = start;
+ if (is_second_reloc) {
+ virt_phys_offset = PAGE_OFFSET - memstart_addr;
+ return;
}
- printk(KERN_INFO "Memory CAM mapping: CAM0=%ldMb, CAM1=%ldMb,"
- " CAM2=%ldMb residual: %ldMb\n",
- __cam0 >> 20, __cam1 >> 20, __cam2 >> 20,
- (total_lowmem - __cam0 - __cam1 - __cam2) >> 20);
- __max_low_memory = max_low_mem = __cam0 + __cam1 + __cam2;
- __initial_memory_limit = __max_low_memory;
+ /*
+ * Relocatable kernel support based on processing of dynamic
+ * relocation entries. Before we get the real memstart_addr,
+ * We will compute the virt_phys_offset like this:
+ * virt_phys_offset = stext.run - kernstart_addr
+ *
+ * stext.run = (KERNELBASE & ~0x3ffffff) +
+ * (kernstart_addr & 0x3ffffff)
+ * When we relocate, we have :
+ *
+ * (kernstart_addr & 0x3ffffff) = (stext.run & 0x3ffffff)
+ *
+ * hence:
+ * virt_phys_offset = (KERNELBASE & ~0x3ffffff) -
+ * (kernstart_addr & ~0x3ffffff)
+ *
+ */
+ start &= ~0x3ffffff;
+ base &= ~0x3ffffff;
+ virt_phys_offset = base - start;
+ early_get_first_memblock_info(__va(dt_ptr), NULL);
+ /*
+ * We now get the memstart_addr, then we should check if this
+ * address is the same as what the PAGE_OFFSET map to now. If
+ * not we have to change the map of PAGE_OFFSET to memstart_addr
+ * and do a second relocation.
+ */
+ if (start != memstart_addr) {
+ int n;
+ long offset = start - memstart_addr;
+
+ is_second_reloc = 1;
+ n = switch_to_as1();
+ /* map a 64M area for the second relocation */
+ if (memstart_addr > start)
+ map_mem_in_cams(0x4000000, CONFIG_LOWMEM_CAM_NUM);
+ else
+ map_mem_in_cams_addr(start, PAGE_OFFSET + offset,
+ 0x4000000, CONFIG_LOWMEM_CAM_NUM);
+ restore_to_as0(n, offset, __va(dt_ptr), 1);
+ /* We should never reach here */
+ panic("Relocation error");
+ }
}
+#endif
+#endif
diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
new file mode 100644
index 00000000000..d8746684f60
--- /dev/null
+++ b/arch/powerpc/mm/gup.c
@@ -0,0 +1,235 @@
+/*
+ * Lockless get_user_pages_fast for powerpc
+ *
+ * Copyright (C) 2008 Nick Piggin
+ * Copyright (C) 2008 Novell Inc.
+ */
+#undef DEBUG
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/vmstat.h>
+#include <linux/pagemap.h>
+#include <linux/rwsem.h>
+#include <asm/pgtable.h>
+
+#ifdef __HAVE_ARCH_PTE_SPECIAL
+
+/*
+ * The performance critical leaf functions are made noinline otherwise gcc
+ * inlines everything into a single function which results in too much
+ * register pressure.
+ */
+static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
+ unsigned long end, int write, struct page **pages, int *nr)
+{
+ unsigned long mask, result;
+ pte_t *ptep;
+
+ result = _PAGE_PRESENT|_PAGE_USER;
+ if (write)
+ result |= _PAGE_RW;
+ mask = result | _PAGE_SPECIAL;
+
+ ptep = pte_offset_kernel(&pmd, addr);
+ do {
+ pte_t pte = ACCESS_ONCE(*ptep);
+ struct page *page;
+ /*
+ * Similar to the PMD case, NUMA hinting must take slow path
+ */
+ if (pte_numa(pte))
+ return 0;
+
+ if ((pte_val(pte) & mask) != result)
+ return 0;
+ VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+ page = pte_page(pte);
+ if (!page_cache_get_speculative(page))
+ return 0;
+ if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+ put_page(page);
+ return 0;
+ }
+ pages[*nr] = page;
+ (*nr)++;
+
+ } while (ptep++, addr += PAGE_SIZE, addr != end);
+
+ return 1;
+}
+
+static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
+ int write, struct page **pages, int *nr)
+{
+ unsigned long next;
+ pmd_t *pmdp;
+
+ pmdp = pmd_offset(&pud, addr);
+ do {
+ pmd_t pmd = ACCESS_ONCE(*pmdp);
+
+ next = pmd_addr_end(addr, end);
+ /*
+ * If we find a splitting transparent hugepage we
+ * return zero. That will result in taking the slow
+ * path which will call wait_split_huge_page()
+ * if the pmd is still in splitting state
+ */
+ if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+ return 0;
+ if (pmd_huge(pmd) || pmd_large(pmd)) {
+ /*
+ * NUMA hinting faults need to be handled in the GUP
+ * slowpath for accounting purposes and so that they
+ * can be serialised against THP migration.
+ */
+ if (pmd_numa(pmd))
+ return 0;
+
+ if (!gup_hugepte((pte_t *)pmdp, PMD_SIZE, addr, next,
+ write, pages, nr))
+ return 0;
+ } else if (is_hugepd(pmdp)) {
+ if (!gup_hugepd((hugepd_t *)pmdp, PMD_SHIFT,
+ addr, next, write, pages, nr))
+ return 0;
+ } else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+ return 0;
+ } while (pmdp++, addr = next, addr != end);
+
+ return 1;
+}
+
+static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
+ int write, struct page **pages, int *nr)
+{
+ unsigned long next;
+ pud_t *pudp;
+
+ pudp = pud_offset(&pgd, addr);
+ do {
+ pud_t pud = ACCESS_ONCE(*pudp);
+
+ next = pud_addr_end(addr, end);
+ if (pud_none(pud))
+ return 0;
+ if (pud_huge(pud)) {
+ if (!gup_hugepte((pte_t *)pudp, PUD_SIZE, addr, next,
+ write, pages, nr))
+ return 0;
+ } else if (is_hugepd(pudp)) {
+ if (!gup_hugepd((hugepd_t *)pudp, PUD_SHIFT,
+ addr, next, write, pages, nr))
+ return 0;
+ } else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+ return 0;
+ } while (pudp++, addr = next, addr != end);
+
+ return 1;
+}
+
+int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
+ struct page **pages)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long addr, len, end;
+ unsigned long next;
+ unsigned long flags;
+ pgd_t *pgdp;
+ int nr = 0;
+
+ pr_devel("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read");
+
+ start &= PAGE_MASK;
+ addr = start;
+ len = (unsigned long) nr_pages << PAGE_SHIFT;
+ end = start + len;
+
+ if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+ start, len)))
+ return 0;
+
+ pr_devel(" aligned: %lx .. %lx\n", start, end);
+
+ /*
+ * XXX: batch / limit 'nr', to avoid large irq off latency
+ * needs some instrumenting to determine the common sizes used by
+ * important workloads (eg. DB2), and whether limiting the batch size
+ * will decrease performance.
+ *
+ * It seems like we're in the clear for the moment. Direct-IO is
+ * the main guy that batches up lots of get_user_pages, and even
+ * they are limited to 64-at-a-time which is not so many.
+ */
+ /*
+ * This doesn't prevent pagetable teardown, but does prevent
+ * the pagetables from being freed on powerpc.
+ *
+ * So long as we atomically load page table pointers versus teardown,
+ * we can follow the address down to the the page and take a ref on it.
+ */
+ local_irq_save(flags);
+
+ pgdp = pgd_offset(mm, addr);
+ do {
+ pgd_t pgd = ACCESS_ONCE(*pgdp);
+
+ pr_devel(" %016lx: normal pgd %p\n", addr,
+ (void *)pgd_val(pgd));
+ next = pgd_addr_end(addr, end);
+ if (pgd_none(pgd))
+ break;
+ if (pgd_huge(pgd)) {
+ if (!gup_hugepte((pte_t *)pgdp, PGDIR_SIZE, addr, next,
+ write, pages, &nr))
+ break;
+ } else if (is_hugepd(pgdp)) {
+ if (!gup_hugepd((hugepd_t *)pgdp, PGDIR_SHIFT,
+ addr, next, write, pages, &nr))
+ break;
+ } else if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+ break;
+ } while (pgdp++, addr = next, addr != end);
+
+ local_irq_restore(flags);
+
+ return nr;
+}
+
+int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+ struct page **pages)
+{
+ struct mm_struct *mm = current->mm;
+ int nr, ret;
+
+ start &= PAGE_MASK;
+ nr = __get_user_pages_fast(start, nr_pages, write, pages);
+ ret = nr;
+
+ if (nr < nr_pages) {
+ pr_devel(" slow path ! nr = %d\n", nr);
+
+ /* Try to get the remaining pages with get_user_pages */
+ start += nr << PAGE_SHIFT;
+ pages += nr;
+
+ down_read(&mm->mmap_sem);
+ ret = get_user_pages(current, mm, start,
+ nr_pages - nr, write, 0, pages, NULL);
+ up_read(&mm->mmap_sem);
+
+ /* Have to be a bit careful with return values */
+ if (nr > 0) {
+ if (ret < 0)
+ ret = nr;
+ else
+ ret += nr;
+ }
+ }
+
+ return ret;
+}
+
+#endif /* __HAVE_ARCH_PTE_SPECIAL */
diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S
index e10d76a860d..115347f74ce 100644
--- a/arch/powerpc/mm/hash_low_32.S
+++ b/arch/powerpc/mm/hash_low_32.S
@@ -1,6 +1,4 @@
/*
- * $Id: hashtable.S,v 1.6 1999/10/08 01:56:15 paulus Exp $
- *
* PowerPC version
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
* Rewritten by Cort Dougan (cort@cs.nmt.edu) for PReP
@@ -38,46 +36,16 @@ mmu_hash_lock:
#endif /* CONFIG_SMP */
/*
- * Sync CPUs with hash_page taking & releasing the hash
- * table lock
- */
-#ifdef CONFIG_SMP
- .text
-_GLOBAL(hash_page_sync)
- mfmsr r10
- rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */
- mtmsr r0
- lis r8,mmu_hash_lock@h
- ori r8,r8,mmu_hash_lock@l
- lis r0,0x0fff
- b 10f
-11: lwz r6,0(r8)
- cmpwi 0,r6,0
- bne 11b
-10: lwarx r6,0,r8
- cmpwi 0,r6,0
- bne- 11b
- stwcx. r0,0,r8
- bne- 10b
- isync
- eieio
- li r0,0
- stw r0,0(r8)
- mtmsr r10
- blr
-#endif /* CONFIG_SMP */
-
-/*
* Load a PTE into the hash table, if possible.
* The address is in r4, and r3 contains an access flag:
* _PAGE_RW (0x400) if a write.
* r9 contains the SRR1 value, from which we use the MSR_PR bit.
- * SPRG3 contains the physical address of the current task's thread.
+ * SPRG_THREAD contains the physical address of the current task's thread.
*
* Returns to the caller if the access is illegal or there is no
* mapping for the address. Otherwise it places an appropriate PTE
* in the hash table and returns from the exception.
- * Uses r0, r3 - r8, ctr, lr.
+ * Uses r0, r3 - r8, r10, ctr, lr.
*/
.text
_GLOBAL(hash_page)
@@ -100,7 +68,7 @@ _GLOBAL(hash_page)
/* Get PTE (linux-style) and check access */
lis r0,KERNELBASE@h /* check if kernel address */
cmplw 0,r4,r0
- mfspr r8,SPRN_SPRG3 /* current task's THREAD (phys) */
+ mfspr r8,SPRN_SPRG_THREAD /* current task's THREAD (phys) */
ori r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */
lwz r5,PGDIR(r8) /* virt page-table root */
blt+ 112f /* assume user more likely */
@@ -108,9 +76,15 @@ _GLOBAL(hash_page)
addi r5,r5,swapper_pg_dir@l /* kernel page table */
rlwimi r3,r9,32-12,29,29 /* MSR_PR -> _PAGE_USER */
112: add r5,r5,r7 /* convert to phys addr */
+#ifndef CONFIG_PTE_64BIT
rlwimi r5,r4,12,20,29 /* insert top 10 bits of address */
lwz r8,0(r5) /* get pmd entry */
rlwinm. r8,r8,0,0,19 /* extract address of pte page */
+#else
+ rlwinm r8,r4,13,19,29 /* Compute pgdir/pmd offset */
+ lwzx r8,r8,r5 /* Get L1 entry */
+ rlwinm. r8,r8,0,0,20 /* extract pt base address */
+#endif
#ifdef CONFIG_SMP
beq- hash_page_out /* return if no mapping */
#else
@@ -120,7 +94,11 @@ _GLOBAL(hash_page)
to the address following the rfi. */
beqlr-
#endif
+#ifndef CONFIG_PTE_64BIT
rlwimi r8,r4,22,20,29 /* insert next 10 bits of address */
+#else
+ rlwimi r8,r4,23,20,28 /* compute pte address */
+#endif
rlwinm r0,r3,32-3,24,24 /* _PAGE_RW access -> _PAGE_DIRTY */
ori r0,r0,_PAGE_ACCESSED|_PAGE_HASHPTE
@@ -129,9 +107,15 @@ _GLOBAL(hash_page)
* because almost always, there won't be a permission violation
* and there won't already be an HPTE, and thus we will have
* to update the PTE to set _PAGE_HASHPTE. -- paulus.
+ *
+ * If PTE_64BIT is set, the low word is the flags word; use that
+ * word for locking since it contains all the interesting bits.
*/
+#if (PTE_FLAGS_OFFSET != 0)
+ addi r8,r8,PTE_FLAGS_OFFSET
+#endif
retry:
- lwarx r6,0,r8 /* get linux-style pte */
+ lwarx r6,0,r8 /* get linux-style pte, flag word */
andc. r5,r3,r6 /* check access & ~permission */
#ifdef CONFIG_SMP
bne- hash_page_out /* return if access not permitted */
@@ -139,6 +123,15 @@ retry:
bnelr-
#endif
or r5,r0,r6 /* set accessed/dirty bits */
+#ifdef CONFIG_PTE_64BIT
+#ifdef CONFIG_SMP
+ subf r10,r6,r8 /* create false data dependency */
+ subi r10,r10,PTE_FLAGS_OFFSET
+ lwzx r10,r6,r10 /* Get upper PTE word */
+#else
+ lwz r10,-PTE_FLAGS_OFFSET(r8)
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_PTE_64BIT */
stwcx. r5,0,r8 /* attempt to update PTE */
bne- retry /* retry if someone got there first */
@@ -191,7 +184,7 @@ _GLOBAL(add_hash_page)
add r3,r3,r0 /* note create_hpte trims to 24 bits */
#ifdef CONFIG_SMP
- rlwinm r8,r1,0,0,18 /* use cpu number to make tag */
+ CURRENT_THREAD_INFO(r8, r1) /* use cpu number to make tag */
lwz r8,TI_CPU(r8) /* to go in mmu_hash_lock */
oris r8,r8,12
#endif /* CONFIG_SMP */
@@ -205,9 +198,9 @@ _GLOBAL(add_hash_page)
* we can't take a hash table miss (assuming the code is
* covered by a BAT). -- paulus
*/
- mfmsr r10
+ mfmsr r9
SYNC
- rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */
+ rlwinm r0,r9,0,17,15 /* clear bit 16 (MSR_EE) */
rlwinm r0,r0,0,28,26 /* clear MSR_DR */
mtmsr r0
SYNC_601
@@ -216,14 +209,14 @@ _GLOBAL(add_hash_page)
tophys(r7,0)
#ifdef CONFIG_SMP
- addis r9,r7,mmu_hash_lock@ha
- addi r9,r9,mmu_hash_lock@l
-10: lwarx r0,0,r9 /* take the mmu_hash_lock */
+ addis r6,r7,mmu_hash_lock@ha
+ addi r6,r6,mmu_hash_lock@l
+10: lwarx r0,0,r6 /* take the mmu_hash_lock */
cmpi 0,r0,0
bne- 11f
- stwcx. r8,0,r9
+ stwcx. r8,0,r6
beq+ 12f
-11: lwz r0,0(r9)
+11: lwz r0,0(r6)
cmpi 0,r0,0
beq 10b
b 11b
@@ -236,10 +229,24 @@ _GLOBAL(add_hash_page)
* HPTE, so we just unlock and return.
*/
mr r8,r5
+#ifndef CONFIG_PTE_64BIT
rlwimi r8,r4,22,20,29
+#else
+ rlwimi r8,r4,23,20,28
+ addi r8,r8,PTE_FLAGS_OFFSET
+#endif
1: lwarx r6,0,r8
andi. r0,r6,_PAGE_HASHPTE
bne 9f /* if HASHPTE already set, done */
+#ifdef CONFIG_PTE_64BIT
+#ifdef CONFIG_SMP
+ subf r10,r6,r8 /* create false data dependency */
+ subi r10,r10,PTE_FLAGS_OFFSET
+ lwzx r10,r6,r10 /* Get upper PTE word */
+#else
+ lwz r10,-PTE_FLAGS_OFFSET(r8)
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_PTE_64BIT */
ori r5,r6,_PAGE_HASHPTE
stwcx. r5,0,r8
bne- 1b
@@ -248,13 +255,15 @@ _GLOBAL(add_hash_page)
9:
#ifdef CONFIG_SMP
+ addis r6,r7,mmu_hash_lock@ha
+ addi r6,r6,mmu_hash_lock@l
eieio
li r0,0
- stw r0,0(r9) /* clear mmu_hash_lock */
+ stw r0,0(r6) /* clear mmu_hash_lock */
#endif
/* reenable interrupts and DR */
- mtmsr r10
+ mtmsr r9
SYNC_601
isync
@@ -269,7 +278,8 @@ _GLOBAL(add_hash_page)
* r5 contains the linux PTE, r6 contains the old value of the
* linux PTE (before setting _PAGE_HASHPTE) and r7 contains the
* offset to be added to addresses (0 if the MMU is on,
- * -KERNELBASE if it is off).
+ * -KERNELBASE if it is off). r10 contains the upper half of
+ * the PTE if CONFIG_PTE_64BIT.
* On SMP, the caller should have the mmu_hash_lock held.
* We assume that the caller has (or will) set the _PAGE_HASHPTE
* bit in the linux PTE in memory. The value passed in r6 should
@@ -287,7 +297,7 @@ Hash_bits = 12 /* e.g. 256kB hash table */
Hash_msk = (((1 << Hash_bits) - 1) * 64)
/* defines for the PTE format for 32-bit PPCs */
-#define PTE_SIZE 8
+#define HPTE_SIZE 8
#define PTEG_SIZE 64
#define LG_PTEG_SIZE 6
#define LDPTEu lwzu
@@ -310,11 +320,16 @@ _GLOBAL(create_hpte)
and r8,r8,r0 /* writable if _RW & _DIRTY */
rlwimi r5,r5,32-1,30,30 /* _PAGE_USER -> PP msb */
rlwimi r5,r5,32-2,31,31 /* _PAGE_USER -> PP lsb */
- ori r8,r8,0xe14 /* clear out reserved bits and M */
+ ori r8,r8,0xe04 /* clear out reserved bits */
andc r8,r5,r8 /* PP = user? (rw&dirty? 2: 3): 0 */
BEGIN_FTR_SECTION
- ori r8,r8,_PAGE_COHERENT /* set M (coherence required) */
-END_FTR_SECTION_IFSET(CPU_FTR_NEED_COHERENT)
+ rlwinm r8,r8,0,~_PAGE_COHERENT /* clear M (coherence not required) */
+END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT)
+#ifdef CONFIG_PTE_64BIT
+ /* Put the XPN bits into the PTE */
+ rlwimi r8,r10,8,20,22
+ rlwimi r8,r10,2,29,29
+#endif
/* Construct the high word of the PPC-style PTE (r5) */
rlwinm r5,r3,7,1,24 /* put VSID in 0x7fffff80 bits */
@@ -344,8 +359,8 @@ _GLOBAL(hash_page_patch_A)
/* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */
mtctr r0
- addi r4,r3,-PTE_SIZE
-1: LDPTEu r6,PTE_SIZE(r4) /* get next PTE */
+ addi r4,r3,-HPTE_SIZE
+1: LDPTEu r6,HPTE_SIZE(r4) /* get next PTE */
CMPPTE 0,r6,r5
bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */
beq+ found_slot
@@ -355,9 +370,9 @@ _GLOBAL(hash_page_patch_A)
_GLOBAL(hash_page_patch_B)
xoris r4,r3,Hash_msk>>16 /* compute secondary hash */
xori r4,r4,(-PTEG_SIZE & 0xffff)
- addi r4,r4,-PTE_SIZE
+ addi r4,r4,-HPTE_SIZE
mtctr r0
-2: LDPTEu r6,PTE_SIZE(r4)
+2: LDPTEu r6,HPTE_SIZE(r4)
CMPPTE 0,r6,r5
bdnzf 2,2b
beq+ found_slot
@@ -365,8 +380,8 @@ _GLOBAL(hash_page_patch_B)
/* Search the primary PTEG for an empty slot */
10: mtctr r0
- addi r4,r3,-PTE_SIZE /* search primary PTEG */
-1: LDPTEu r6,PTE_SIZE(r4) /* get next PTE */
+ addi r4,r3,-HPTE_SIZE /* search primary PTEG */
+1: LDPTEu r6,HPTE_SIZE(r4) /* get next PTE */
TST_V(r6) /* test valid bit */
bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */
beq+ found_empty
@@ -382,9 +397,9 @@ _GLOBAL(hash_page_patch_B)
_GLOBAL(hash_page_patch_C)
xoris r4,r3,Hash_msk>>16 /* compute secondary hash */
xori r4,r4,(-PTEG_SIZE & 0xffff)
- addi r4,r4,-PTE_SIZE
+ addi r4,r4,-HPTE_SIZE
mtctr r0
-2: LDPTEu r6,PTE_SIZE(r4)
+2: LDPTEu r6,HPTE_SIZE(r4)
TST_V(r6)
bdnzf 2,2b
beq+ found_empty
@@ -411,11 +426,11 @@ _GLOBAL(hash_page_patch_C)
1: addis r4,r7,next_slot@ha /* get next evict slot */
lwz r6,next_slot@l(r4)
- addi r6,r6,PTE_SIZE /* search for candidate */
- andi. r6,r6,7*PTE_SIZE
+ addi r6,r6,HPTE_SIZE /* search for candidate */
+ andi. r6,r6,7*HPTE_SIZE
stw r6,next_slot@l(r4)
add r4,r3,r6
- LDPTE r0,PTE_SIZE/2(r4) /* get PTE second word */
+ LDPTE r0,HPTE_SIZE/2(r4) /* get PTE second word */
clrrwi r0,r0,12
lis r6,etext@h
ori r6,r6,etext@l /* get etext */
@@ -428,7 +443,7 @@ _GLOBAL(hash_page_patch_C)
found_empty:
STPTE r5,0(r4)
found_slot:
- STPTE r8,PTE_SIZE/2(r4)
+ STPTE r8,HPTE_SIZE/2(r4)
#else /* CONFIG_SMP */
/*
@@ -454,7 +469,7 @@ found_slot:
STPTE r5,0(r4)
sync
TLBSYNC
- STPTE r8,PTE_SIZE/2(r4) /* put in correct RPN, WIMG, PP bits */
+ STPTE r8,HPTE_SIZE/2(r4) /* put in correct RPN, WIMG, PP bits */
sync
SET_V(r5)
STPTE r5,0(r4) /* finally set V bit in PTE */
@@ -501,14 +516,18 @@ _GLOBAL(flush_hash_pages)
isync
/* First find a PTE in the range that has _PAGE_HASHPTE set */
+#ifndef CONFIG_PTE_64BIT
rlwimi r5,r4,22,20,29
-1: lwz r0,0(r5)
+#else
+ rlwimi r5,r4,23,20,28
+#endif
+1: lwz r0,PTE_FLAGS_OFFSET(r5)
cmpwi cr1,r6,1
andi. r0,r0,_PAGE_HASHPTE
bne 2f
ble cr1,19f
addi r4,r4,0x1000
- addi r5,r5,4
+ addi r5,r5,PTE_SIZE
addi r6,r6,-1
b 1b
@@ -526,7 +545,7 @@ _GLOBAL(flush_hash_pages)
#ifdef CONFIG_SMP
addis r9,r7,mmu_hash_lock@ha
addi r9,r9,mmu_hash_lock@l
- rlwinm r8,r1,0,0,18
+ CURRENT_THREAD_INFO(r8, r1)
add r8,r8,r7
lwz r8,TI_CPU(r8)
oris r8,r8,9
@@ -547,7 +566,10 @@ _GLOBAL(flush_hash_pages)
* already clear, we're done (for this pte). If not,
* clear it (atomically) and proceed. -- paulus.
*/
-33: lwarx r8,0,r5 /* fetch the pte */
+#if (PTE_FLAGS_OFFSET != 0)
+ addi r5,r5,PTE_FLAGS_OFFSET
+#endif
+33: lwarx r8,0,r5 /* fetch the pte flags word */
andi. r0,r8,_PAGE_HASHPTE
beq 8f /* done if HASHPTE is already clear */
rlwinm r8,r8,0,31,29 /* clear HASHPTE bit */
@@ -564,8 +586,8 @@ _GLOBAL(flush_hash_patch_A)
/* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */
li r0,8 /* PTEs/group */
mtctr r0
- addi r12,r8,-PTE_SIZE
-1: LDPTEu r0,PTE_SIZE(r12) /* get next PTE */
+ addi r12,r8,-HPTE_SIZE
+1: LDPTEu r0,HPTE_SIZE(r12) /* get next PTE */
CMPPTE 0,r0,r11
bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */
beq+ 3f
@@ -576,9 +598,9 @@ _GLOBAL(flush_hash_patch_A)
_GLOBAL(flush_hash_patch_B)
xoris r12,r8,Hash_msk>>16 /* compute secondary hash */
xori r12,r12,(-PTEG_SIZE & 0xffff)
- addi r12,r12,-PTE_SIZE
+ addi r12,r12,-HPTE_SIZE
mtctr r0
-2: LDPTEu r0,PTE_SIZE(r12)
+2: LDPTEu r0,HPTE_SIZE(r12)
CMPPTE 0,r0,r11
bdnzf 2,2b
xori r11,r11,PTE_H /* clear H again */
@@ -592,7 +614,7 @@ _GLOBAL(flush_hash_patch_B)
8: ble cr1,9f /* if all ptes checked */
81: addi r6,r6,-1
- addi r5,r5,4 /* advance to next pte */
+ addi r5,r5,PTE_SIZE
addi r4,r4,0x1000
lwz r0,0(r5) /* check next pte */
cmpwi cr1,r6,1
@@ -611,3 +633,80 @@ _GLOBAL(flush_hash_patch_B)
SYNC_601
isync
blr
+
+/*
+ * Flush an entry from the TLB
+ */
+_GLOBAL(_tlbie)
+#ifdef CONFIG_SMP
+ CURRENT_THREAD_INFO(r8, r1)
+ lwz r8,TI_CPU(r8)
+ oris r8,r8,11
+ mfmsr r10
+ SYNC
+ rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */
+ rlwinm r0,r0,0,28,26 /* clear DR */
+ mtmsr r0
+ SYNC_601
+ isync
+ lis r9,mmu_hash_lock@h
+ ori r9,r9,mmu_hash_lock@l
+ tophys(r9,r9)
+10: lwarx r7,0,r9
+ cmpwi 0,r7,0
+ bne- 10b
+ stwcx. r8,0,r9
+ bne- 10b
+ eieio
+ tlbie r3
+ sync
+ TLBSYNC
+ li r0,0
+ stw r0,0(r9) /* clear mmu_hash_lock */
+ mtmsr r10
+ SYNC_601
+ isync
+#else /* CONFIG_SMP */
+ tlbie r3
+ sync
+#endif /* CONFIG_SMP */
+ blr
+
+/*
+ * Flush the entire TLB. 603/603e only
+ */
+_GLOBAL(_tlbia)
+#if defined(CONFIG_SMP)
+ CURRENT_THREAD_INFO(r8, r1)
+ lwz r8,TI_CPU(r8)
+ oris r8,r8,10
+ mfmsr r10
+ SYNC
+ rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */
+ rlwinm r0,r0,0,28,26 /* clear DR */
+ mtmsr r0
+ SYNC_601
+ isync
+ lis r9,mmu_hash_lock@h
+ ori r9,r9,mmu_hash_lock@l
+ tophys(r9,r9)
+10: lwarx r7,0,r9
+ cmpwi 0,r7,0
+ bne- 10b
+ stwcx. r8,0,r9
+ bne- 10b
+ sync
+ tlbia
+ sync
+ TLBSYNC
+ li r0,0
+ stw r0,0(r9) /* clear mmu_hash_lock */
+ mtmsr r10
+ SYNC_601
+ isync
+#else /* CONFIG_SMP */
+ sync
+ tlbia
+ sync
+#endif /* CONFIG_SMP */
+ blr
diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S
index 21d24848647..057cbbb4c57 100644
--- a/arch/powerpc/mm/hash_low_64.S
+++ b/arch/powerpc/mm/hash_low_64.S
@@ -34,14 +34,6 @@
* | CR save area (SP + 8)
* SP ---> +-- Back chain (SP + 0)
*/
-#define STACKFRAMESIZE 256
-
-/* Save parameters offsets */
-#define STK_PARM(i) (STACKFRAMESIZE + 48 + ((i)-3)*8)
-
-/* Save non-volatile offsets */
-#define STK_REG(i) (112 + ((i)-14)*8)
-
#ifndef CONFIG_PPC_64K_PAGES
@@ -64,25 +56,22 @@ _GLOBAL(__hash_page_4K)
std r0,16(r1)
stdu r1,-STACKFRAMESIZE(r1)
/* Save all params that we need after a function call */
- std r6,STK_PARM(r6)(r1)
- std r8,STK_PARM(r8)(r1)
- std r9,STK_PARM(r9)(r1)
+ std r6,STK_PARAM(R6)(r1)
+ std r8,STK_PARAM(R8)(r1)
+ std r9,STK_PARAM(R9)(r1)
- /* Add _PAGE_PRESENT to access */
- ori r4,r4,_PAGE_PRESENT
-
/* Save non-volatile registers.
* r31 will hold "old PTE"
* r30 is "new PTE"
- * r29 is "va"
+ * r29 is vpn
* r28 is a hash value
* r27 is hashtab mask (maybe dynamic patched instead ?)
*/
- std r27,STK_REG(r27)(r1)
- std r28,STK_REG(r28)(r1)
- std r29,STK_REG(r29)(r1)
- std r30,STK_REG(r30)(r1)
- std r31,STK_REG(r31)(r1)
+ std r27,STK_REG(R27)(r1)
+ std r28,STK_REG(R28)(r1)
+ std r29,STK_REG(R29)(r1)
+ std r30,STK_REG(R30)(r1)
+ std r31,STK_REG(R31)(r1)
/* Step 1:
*
@@ -121,26 +110,34 @@ _GLOBAL(__hash_page_4K)
BEGIN_FTR_SECTION
cmpdi r9,0 /* check segment size */
bne 3f
-END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT)
- /* Calc va and put it in r29 */
- rldicr r29,r5,28,63-28
- rldicl r3,r3,0,36
- or r29,r3,r29
-
- /* Calculate hash value for primary slot and store it in r28 */
- rldicl r5,r5,0,25 /* vsid & 0x0000007fffffffff */
- rldicl r0,r3,64-12,48 /* (ea >> 12) & 0xffff */
- xor r28,r5,r0
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
+ /* Calc vpn and put it in r29 */
+ sldi r29,r5,SID_SHIFT - VPN_SHIFT
+ rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT - VPN_SHIFT)
+ or r29,r28,r29
+ /*
+ * Calculate hash value for primary slot and store it in r28
+ * r3 = va, r5 = vsid
+ * r0 = (va >> 12) & ((1ul << (28 - 12)) -1)
+ */
+ rldicl r0,r3,64-12,48
+ xor r28,r5,r0 /* hash */
b 4f
-3: /* Calc VA and hash in r29 and r28 for 1T segment */
- sldi r29,r5,40 /* vsid << 40 */
- clrldi r3,r3,24 /* ea & 0xffffffffff */
- rldic r28,r5,25,25 /* (vsid << 25) & 0x7fffffffff */
- clrldi r5,r5,40 /* vsid & 0xffffff */
- rldicl r0,r3,64-12,36 /* (ea >> 12) & 0xfffffff */
- xor r28,r28,r5
- or r29,r3,r29 /* VA */
+3: /* Calc vpn and put it in r29 */
+ sldi r29,r5,SID_SHIFT_1T - VPN_SHIFT
+ rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT_1T - VPN_SHIFT)
+ or r29,r28,r29
+
+ /*
+ * calculate hash value for primary slot and
+ * store it in r28 for 1T segment
+ * r3 = va, r5 = vsid
+ */
+ sldi r28,r5,25 /* vsid << 25 */
+ /* r0 = (va >> 12) & ((1ul << (40 - 12)) -1) */
+ rldicl r0,r3,64-12,36
+ xor r28,r28,r5 /* vsid ^ ( vsid << 25) */
xor r28,r28,r0 /* hash */
/* Convert linux PTE bits into HW equivalents */
@@ -151,7 +148,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT)
and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/
andc r0,r30,r0 /* r0 = pte & ~r0 */
rlwimi r3,r0,32-1,31,31 /* Insert result into PP lsb */
- ori r3,r3,HPTE_R_C /* Always add "C" bit for perf. */
+ /*
+ * Always add "C" bit for perf. Memory coherence is always enabled
+ */
+ ori r3,r3,HPTE_R_C | HPTE_R_M
/* We eventually do the icache sync here (maybe inline that
* code rather than call a C function...)
@@ -159,13 +159,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT)
BEGIN_FTR_SECTION
mr r4,r30
mr r5,r7
- bl .hash_page_do_lazy_icache
+ bl hash_page_do_lazy_icache
END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
/* At this point, r3 contains new PP bits, save them in
* place of "access" in the param area (sic)
*/
- std r3,STK_PARM(r4)(r1)
+ std r3,STK_PARAM(R4)(r1)
/* Get htab_hash_mask */
ld r4,htab_hash_mask@got(2)
@@ -195,12 +195,14 @@ htab_insert_pte:
rldicr r3,r0,3,63-3 /* r3 = (hash & mask) << 3 */
/* Call ppc_md.hpte_insert */
- ld r6,STK_PARM(r4)(r1) /* Retreive new pp bits */
- mr r4,r29 /* Retreive va */
+ ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */
+ mr r4,r29 /* Retrieve vpn */
li r7,0 /* !bolted, !secondary */
li r8,MMU_PAGE_4K /* page size */
- ld r9,STK_PARM(r9)(r1) /* segment size */
-_GLOBAL(htab_call_hpte_insert1)
+ li r9,MMU_PAGE_4K /* actual page size */
+ ld r10,STK_PARAM(R9)(r1) /* segment size */
+.globl htab_call_hpte_insert1
+htab_call_hpte_insert1:
bl . /* Patched by htab_finish_init() */
cmpdi 0,r3,0
bge htab_pte_insert_ok /* Insertion successful */
@@ -218,12 +220,14 @@ _GLOBAL(htab_call_hpte_insert1)
rldicr r3,r0,3,63-3 /* r0 = (~hash & mask) << 3 */
/* Call ppc_md.hpte_insert */
- ld r6,STK_PARM(r4)(r1) /* Retreive new pp bits */
- mr r4,r29 /* Retreive va */
+ ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */
+ mr r4,r29 /* Retrieve vpn */
li r7,HPTE_V_SECONDARY /* !bolted, secondary */
li r8,MMU_PAGE_4K /* page size */
- ld r9,STK_PARM(r9)(r1) /* segment size */
-_GLOBAL(htab_call_hpte_insert2)
+ li r9,MMU_PAGE_4K /* actual page size */
+ ld r10,STK_PARAM(R9)(r1) /* segment size */
+.globl htab_call_hpte_insert2
+htab_call_hpte_insert2:
bl . /* Patched by htab_finish_init() */
cmpdi 0,r3,0
bge+ htab_pte_insert_ok /* Insertion successful */
@@ -240,7 +244,8 @@ _GLOBAL(htab_call_hpte_insert2)
2: and r0,r5,r27
rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */
/* Call ppc_md.hpte_remove */
-_GLOBAL(htab_call_hpte_remove)
+.globl htab_call_hpte_remove
+htab_call_hpte_remove:
bl . /* Patched by htab_finish_init() */
/* Try all again */
@@ -258,15 +263,15 @@ htab_pte_insert_ok:
* (maybe add eieio may be good still ?)
*/
htab_write_out_pte:
- ld r6,STK_PARM(r6)(r1)
+ ld r6,STK_PARAM(R6)(r1)
std r30,0(r6)
li r3, 0
htab_bail:
- ld r27,STK_REG(r27)(r1)
- ld r28,STK_REG(r28)(r1)
- ld r29,STK_REG(r29)(r1)
- ld r30,STK_REG(r30)(r1)
- ld r31,STK_REG(r31)(r1)
+ ld r27,STK_REG(R27)(r1)
+ ld r28,STK_REG(R28)(r1)
+ ld r29,STK_REG(R29)(r1)
+ ld r30,STK_REG(R30)(r1)
+ ld r31,STK_REG(R31)(r1)
addi r1,r1,STACKFRAMESIZE
ld r0,16(r1)
mtlr r0
@@ -289,11 +294,13 @@ htab_modify_pte:
add r3,r0,r3 /* add slot idx */
/* Call ppc_md.hpte_updatepp */
- mr r5,r29 /* va */
- li r6,MMU_PAGE_4K /* page size */
- ld r7,STK_PARM(r9)(r1) /* segment size */
- ld r8,STK_PARM(r8)(r1) /* get "local" param */
-_GLOBAL(htab_call_hpte_updatepp)
+ mr r5,r29 /* vpn */
+ li r6,MMU_PAGE_4K /* base page size */
+ li r7,MMU_PAGE_4K /* actual page size */
+ ld r8,STK_PARAM(R9)(r1) /* segment size */
+ ld r9,STK_PARAM(R8)(r1) /* get "local" param */
+.globl htab_call_hpte_updatepp
+htab_call_hpte_updatepp:
bl . /* Patched by htab_finish_init() */
/* if we failed because typically the HPTE wasn't really here
@@ -315,7 +322,7 @@ htab_wrong_access:
htab_pte_insert_failure:
/* Bail out restoring old PTE */
- ld r6,STK_PARM(r6)(r1)
+ ld r6,STK_PARAM(R6)(r1)
std r31,0(r6)
li r3,-1
b htab_bail
@@ -343,29 +350,26 @@ _GLOBAL(__hash_page_4K)
std r0,16(r1)
stdu r1,-STACKFRAMESIZE(r1)
/* Save all params that we need after a function call */
- std r6,STK_PARM(r6)(r1)
- std r8,STK_PARM(r8)(r1)
- std r9,STK_PARM(r9)(r1)
-
- /* Add _PAGE_PRESENT to access */
- ori r4,r4,_PAGE_PRESENT
+ std r6,STK_PARAM(R6)(r1)
+ std r8,STK_PARAM(R8)(r1)
+ std r9,STK_PARAM(R9)(r1)
/* Save non-volatile registers.
* r31 will hold "old PTE"
* r30 is "new PTE"
- * r29 is "va"
+ * r29 is vpn
* r28 is a hash value
* r27 is hashtab mask (maybe dynamic patched instead ?)
* r26 is the hidx mask
* r25 is the index in combo page
*/
- std r25,STK_REG(r25)(r1)
- std r26,STK_REG(r26)(r1)
- std r27,STK_REG(r27)(r1)
- std r28,STK_REG(r28)(r1)
- std r29,STK_REG(r29)(r1)
- std r30,STK_REG(r30)(r1)
- std r31,STK_REG(r31)(r1)
+ std r25,STK_REG(R25)(r1)
+ std r26,STK_REG(R26)(r1)
+ std r27,STK_REG(R27)(r1)
+ std r28,STK_REG(R28)(r1)
+ std r29,STK_REG(R29)(r1)
+ std r30,STK_REG(R30)(r1)
+ std r31,STK_REG(R31)(r1)
/* Step 1:
*
@@ -388,7 +392,7 @@ _GLOBAL(__hash_page_4K)
*/
rlwinm r30,r4,32-9+7,31-7,31-7 /* _PAGE_RW -> _PAGE_DIRTY */
or r30,r30,r31
- ori r30,r30,_PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE
+ ori r30,r30,_PAGE_BUSY | _PAGE_ACCESSED
oris r30,r30,_PAGE_COMBO@h
/* Write the linux PTE atomically (setting busy) */
stdcx. r30,0,r6
@@ -407,26 +411,42 @@ _GLOBAL(__hash_page_4K)
BEGIN_FTR_SECTION
cmpdi r9,0 /* check segment size */
bne 3f
-END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT)
- /* Calc va and put it in r29 */
- rldicr r29,r5,28,63-28 /* r29 = (vsid << 28) */
- rldicl r3,r3,0,36 /* r3 = (ea & 0x0fffffff) */
- or r29,r3,r29 /* r29 = va */
-
- /* Calculate hash value for primary slot and store it in r28 */
- rldicl r5,r5,0,25 /* vsid & 0x0000007fffffffff */
- rldicl r0,r3,64-12,48 /* (ea >> 12) & 0xffff */
- xor r28,r5,r0
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
+ /* Calc vpn and put it in r29 */
+ sldi r29,r5,SID_SHIFT - VPN_SHIFT
+ /*
+ * clrldi r3,r3,64 - SID_SHIFT --> ea & 0xfffffff
+ * srdi r28,r3,VPN_SHIFT
+ */
+ rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT - VPN_SHIFT)
+ or r29,r28,r29
+ /*
+ * Calculate hash value for primary slot and store it in r28
+ * r3 = va, r5 = vsid
+ * r0 = (va >> 12) & ((1ul << (28 - 12)) -1)
+ */
+ rldicl r0,r3,64-12,48
+ xor r28,r5,r0 /* hash */
b 4f
-3: /* Calc VA and hash in r29 and r28 for 1T segment */
- sldi r29,r5,40 /* vsid << 40 */
- clrldi r3,r3,24 /* ea & 0xffffffffff */
- rldic r28,r5,25,25 /* (vsid << 25) & 0x7fffffffff */
- clrldi r5,r5,40 /* vsid & 0xffffff */
- rldicl r0,r3,64-12,36 /* (ea >> 12) & 0xfffffff */
- xor r28,r28,r5
- or r29,r3,r29 /* VA */
+3: /* Calc vpn and put it in r29 */
+ sldi r29,r5,SID_SHIFT_1T - VPN_SHIFT
+ /*
+ * clrldi r3,r3,64 - SID_SHIFT_1T --> ea & 0xffffffffff
+ * srdi r28,r3,VPN_SHIFT
+ */
+ rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT_1T - VPN_SHIFT)
+ or r29,r28,r29
+
+ /*
+ * Calculate hash value for primary slot and
+ * store it in r28 for 1T segment
+ * r3 = va, r5 = vsid
+ */
+ sldi r28,r5,25 /* vsid << 25 */
+ /* r0 = (va >> 12) & ((1ul << (40 - 12)) -1) */
+ rldicl r0,r3,64-12,36
+ xor r28,r28,r5 /* vsid ^ ( vsid << 25) */
xor r28,r28,r0 /* hash */
/* Convert linux PTE bits into HW equivalents */
@@ -444,7 +464,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT)
and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/
andc r0,r3,r0 /* r0 = pte & ~r0 */
rlwimi r3,r0,32-1,31,31 /* Insert result into PP lsb */
- ori r3,r3,HPTE_R_C /* Always add "C" bit for perf. */
+ /*
+ * Always add "C" bit for perf. Memory coherence is always enabled
+ */
+ ori r3,r3,HPTE_R_C | HPTE_R_M
/* We eventually do the icache sync here (maybe inline that
* code rather than call a C function...)
@@ -452,13 +475,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT)
BEGIN_FTR_SECTION
mr r4,r30
mr r5,r7
- bl .hash_page_do_lazy_icache
+ bl hash_page_do_lazy_icache
END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
/* At this point, r3 contains new PP bits, save them in
* place of "access" in the param area (sic)
*/
- std r3,STK_PARM(r4)(r1)
+ std r3,STK_PARAM(R4)(r1)
/* Get htab_hash_mask */
ld r4,htab_hash_mask@got(2)
@@ -468,7 +491,7 @@ END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
* go to out-of-line code to try to modify the HPTE. We look for
* the bit at (1 >> (index + 32))
*/
- andi. r0,r31,_PAGE_HASHPTE
+ rldicl. r0,r31,64-12,48
li r26,0 /* Default hidx */
beq htab_insert_pte
@@ -479,8 +502,8 @@ END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
andis. r0,r31,_PAGE_COMBO@h
beq htab_inval_old_hpte
- ld r6,STK_PARM(r6)(r1)
- ori r26,r6,0x8000 /* Load the hidx mask */
+ ld r6,STK_PARAM(R6)(r1)
+ ori r26,r6,PTE_PAGE_HIDX_OFFSET /* Load the hidx mask. */
ld r26,0(r26)
addi r5,r25,36 /* Check actual HPTE_SUB bit, this */
rldcr. r0,r31,r5,0 /* must match pgtable.h definition */
@@ -501,12 +524,14 @@ htab_special_pfn:
rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */
/* Call ppc_md.hpte_insert */
- ld r6,STK_PARM(r4)(r1) /* Retreive new pp bits */
- mr r4,r29 /* Retreive va */
+ ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */
+ mr r4,r29 /* Retrieve vpn */
li r7,0 /* !bolted, !secondary */
li r8,MMU_PAGE_4K /* page size */
- ld r9,STK_PARM(r9)(r1) /* segment size */
-_GLOBAL(htab_call_hpte_insert1)
+ li r9,MMU_PAGE_4K /* actual page size */
+ ld r10,STK_PARAM(R9)(r1) /* segment size */
+.globl htab_call_hpte_insert1
+htab_call_hpte_insert1:
bl . /* patched by htab_finish_init() */
cmpdi 0,r3,0
bge htab_pte_insert_ok /* Insertion successful */
@@ -528,12 +553,14 @@ _GLOBAL(htab_call_hpte_insert1)
rldicr r3,r0,3,63-3 /* r0 = (~hash & mask) << 3 */
/* Call ppc_md.hpte_insert */
- ld r6,STK_PARM(r4)(r1) /* Retreive new pp bits */
- mr r4,r29 /* Retreive va */
+ ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */
+ mr r4,r29 /* Retrieve vpn */
li r7,HPTE_V_SECONDARY /* !bolted, secondary */
li r8,MMU_PAGE_4K /* page size */
- ld r9,STK_PARM(r9)(r1) /* segment size */
-_GLOBAL(htab_call_hpte_insert2)
+ li r9,MMU_PAGE_4K /* actual page size */
+ ld r10,STK_PARAM(R9)(r1) /* segment size */
+.globl htab_call_hpte_insert2
+htab_call_hpte_insert2:
bl . /* patched by htab_finish_init() */
cmpdi 0,r3,0
bge+ htab_pte_insert_ok /* Insertion successful */
@@ -550,7 +577,8 @@ _GLOBAL(htab_call_hpte_insert2)
2: and r0,r5,r27
rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */
/* Call ppc_md.hpte_remove */
-_GLOBAL(htab_call_hpte_remove)
+.globl htab_call_hpte_remove
+htab_call_hpte_remove:
bl . /* patched by htab_finish_init() */
/* Try all again */
@@ -561,13 +589,17 @@ _GLOBAL(htab_call_hpte_remove)
* useless now that the segment has been switched to 4k pages.
*/
htab_inval_old_hpte:
- mr r3,r29 /* virtual addr */
+ mr r3,r29 /* vpn */
mr r4,r31 /* PTE.pte */
li r5,0 /* PTE.hidx */
li r6,MMU_PAGE_64K /* psize */
- ld r7,STK_PARM(r9)(r1) /* ssize */
- ld r8,STK_PARM(r8)(r1) /* local */
- bl .flush_hash_page
+ ld r7,STK_PARAM(R9)(r1) /* ssize */
+ ld r8,STK_PARAM(R8)(r1) /* local */
+ bl flush_hash_page
+ /* Clear out _PAGE_HPTE_SUB bits in the new linux PTE */
+ lis r0,_PAGE_HPTE_SUB@h
+ ori r0,r0,_PAGE_HPTE_SUB@l
+ andc r30,r30,r0
b htab_insert_pte
htab_bail_ok:
@@ -578,7 +610,7 @@ htab_pte_insert_ok:
/* Insert slot number & secondary bit in PTE second half,
* clear _PAGE_BUSY and set approriate HPTE slot bit
*/
- ld r6,STK_PARM(r6)(r1)
+ ld r6,STK_PARAM(R6)(r1)
li r0,_PAGE_BUSY
andc r30,r30,r0
/* HPTE SUB bit */
@@ -593,19 +625,19 @@ htab_pte_insert_ok:
sld r4,r4,r5
andc r26,r26,r4
or r26,r26,r3
- ori r5,r6,0x8000
+ ori r5,r6,PTE_PAGE_HIDX_OFFSET
std r26,0(r5)
lwsync
std r30,0(r6)
li r3, 0
htab_bail:
- ld r25,STK_REG(r25)(r1)
- ld r26,STK_REG(r26)(r1)
- ld r27,STK_REG(r27)(r1)
- ld r28,STK_REG(r28)(r1)
- ld r29,STK_REG(r29)(r1)
- ld r30,STK_REG(r30)(r1)
- ld r31,STK_REG(r31)(r1)
+ ld r25,STK_REG(R25)(r1)
+ ld r26,STK_REG(R26)(r1)
+ ld r27,STK_REG(R27)(r1)
+ ld r28,STK_REG(R28)(r1)
+ ld r29,STK_REG(R29)(r1)
+ ld r30,STK_REG(R30)(r1)
+ ld r31,STK_REG(R31)(r1)
addi r1,r1,STACKFRAMESIZE
ld r0,16(r1)
mtlr r0
@@ -630,11 +662,13 @@ htab_modify_pte:
add r3,r0,r3 /* add slot idx */
/* Call ppc_md.hpte_updatepp */
- mr r5,r29 /* va */
- li r6,MMU_PAGE_4K /* page size */
- ld r7,STK_PARM(r9)(r1) /* segment size */
- ld r8,STK_PARM(r8)(r1) /* get "local" param */
-_GLOBAL(htab_call_hpte_updatepp)
+ mr r5,r29 /* vpn */
+ li r6,MMU_PAGE_4K /* base page size */
+ li r7,MMU_PAGE_4K /* actual page size */
+ ld r8,STK_PARAM(R9)(r1) /* segment size */
+ ld r9,STK_PARAM(R8)(r1) /* get "local" param */
+.globl htab_call_hpte_updatepp
+htab_call_hpte_updatepp:
bl . /* patched by htab_finish_init() */
/* if we failed because typically the HPTE wasn't really here
@@ -646,7 +680,7 @@ _GLOBAL(htab_call_hpte_updatepp)
/* Clear the BUSY bit and Write out the PTE */
li r0,_PAGE_BUSY
andc r30,r30,r0
- ld r6,STK_PARM(r6)(r1)
+ ld r6,STK_PARAM(R6)(r1)
std r30,0(r6)
li r3,0
b htab_bail
@@ -659,7 +693,7 @@ htab_wrong_access:
htab_pte_insert_failure:
/* Bail out restoring old PTE */
- ld r6,STK_PARM(r6)(r1)
+ ld r6,STK_PARAM(R6)(r1)
std r31,0(r6)
li r3,-1
b htab_bail
@@ -679,25 +713,22 @@ _GLOBAL(__hash_page_64K)
std r0,16(r1)
stdu r1,-STACKFRAMESIZE(r1)
/* Save all params that we need after a function call */
- std r6,STK_PARM(r6)(r1)
- std r8,STK_PARM(r8)(r1)
- std r9,STK_PARM(r9)(r1)
-
- /* Add _PAGE_PRESENT to access */
- ori r4,r4,_PAGE_PRESENT
+ std r6,STK_PARAM(R6)(r1)
+ std r8,STK_PARAM(R8)(r1)
+ std r9,STK_PARAM(R9)(r1)
/* Save non-volatile registers.
* r31 will hold "old PTE"
* r30 is "new PTE"
- * r29 is "va"
+ * r29 is vpn
* r28 is a hash value
* r27 is hashtab mask (maybe dynamic patched instead ?)
*/
- std r27,STK_REG(r27)(r1)
- std r28,STK_REG(r28)(r1)
- std r29,STK_REG(r29)(r1)
- std r30,STK_REG(r30)(r1)
- std r31,STK_REG(r31)(r1)
+ std r27,STK_REG(R27)(r1)
+ std r28,STK_REG(R28)(r1)
+ std r29,STK_REG(R29)(r1)
+ std r30,STK_REG(R30)(r1)
+ std r31,STK_REG(R31)(r1)
/* Step 1:
*
@@ -720,13 +751,13 @@ BEGIN_FTR_SECTION
andi. r0,r31,_PAGE_NO_CACHE
/* If so, bail out and refault as a 4k page */
bne- ht64_bail_ok
-END_FTR_SECTION_IFCLR(CPU_FTR_CI_LARGE_PAGE)
+END_MMU_FTR_SECTION_IFCLR(MMU_FTR_CI_LARGE_PAGE)
/* Prepare new PTE value (turn access RW into DIRTY, then
- * add BUSY,HASHPTE and ACCESSED)
+ * add BUSY and ACCESSED)
*/
rlwinm r30,r4,32-9+7,31-7,31-7 /* _PAGE_RW -> _PAGE_DIRTY */
or r30,r30,r31
- ori r30,r30,_PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE
+ ori r30,r30,_PAGE_BUSY | _PAGE_ACCESSED
/* Write the linux PTE atomically (setting busy) */
stdcx. r30,0,r6
bne- 1b
@@ -741,26 +772,33 @@ END_FTR_SECTION_IFCLR(CPU_FTR_CI_LARGE_PAGE)
BEGIN_FTR_SECTION
cmpdi r9,0 /* check segment size */
bne 3f
-END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT)
- /* Calc va and put it in r29 */
- rldicr r29,r5,28,63-28
- rldicl r3,r3,0,36
- or r29,r3,r29
-
- /* Calculate hash value for primary slot and store it in r28 */
- rldicl r5,r5,0,25 /* vsid & 0x0000007fffffffff */
- rldicl r0,r3,64-16,52 /* (ea >> 16) & 0xfff */
- xor r28,r5,r0
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
+ /* Calc vpn and put it in r29 */
+ sldi r29,r5,SID_SHIFT - VPN_SHIFT
+ rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT - VPN_SHIFT)
+ or r29,r28,r29
+
+ /* Calculate hash value for primary slot and store it in r28
+ * r3 = va, r5 = vsid
+ * r0 = (va >> 16) & ((1ul << (28 - 16)) -1)
+ */
+ rldicl r0,r3,64-16,52
+ xor r28,r5,r0 /* hash */
b 4f
-3: /* Calc VA and hash in r29 and r28 for 1T segment */
- sldi r29,r5,40 /* vsid << 40 */
- clrldi r3,r3,24 /* ea & 0xffffffffff */
- rldic r28,r5,25,25 /* (vsid << 25) & 0x7fffffffff */
- clrldi r5,r5,40 /* vsid & 0xffffff */
- rldicl r0,r3,64-16,40 /* (ea >> 16) & 0xffffff */
- xor r28,r28,r5
- or r29,r3,r29 /* VA */
+3: /* Calc vpn and put it in r29 */
+ sldi r29,r5,SID_SHIFT_1T - VPN_SHIFT
+ rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT_1T - VPN_SHIFT)
+ or r29,r28,r29
+ /*
+ * calculate hash value for primary slot and
+ * store it in r28 for 1T segment
+ * r3 = va, r5 = vsid
+ */
+ sldi r28,r5,25 /* vsid << 25 */
+ /* r0 = (va >> 16) & ((1ul << (40 - 16)) -1) */
+ rldicl r0,r3,64-16,40
+ xor r28,r28,r5 /* vsid ^ ( vsid << 25) */
xor r28,r28,r0 /* hash */
/* Convert linux PTE bits into HW equivalents */
@@ -771,7 +809,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT)
and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/
andc r0,r30,r0 /* r0 = pte & ~r0 */
rlwimi r3,r0,32-1,31,31 /* Insert result into PP lsb */
- ori r3,r3,HPTE_R_C /* Always add "C" bit for perf. */
+ /*
+ * Always add "C" bit for perf. Memory coherence is always enabled
+ */
+ ori r3,r3,HPTE_R_C | HPTE_R_M
/* We eventually do the icache sync here (maybe inline that
* code rather than call a C function...)
@@ -779,13 +820,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT)
BEGIN_FTR_SECTION
mr r4,r30
mr r5,r7
- bl .hash_page_do_lazy_icache
+ bl hash_page_do_lazy_icache
END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
/* At this point, r3 contains new PP bits, save them in
* place of "access" in the param area (sic)
*/
- std r3,STK_PARM(r4)(r1)
+ std r3,STK_PARAM(R4)(r1)
/* Get htab_hash_mask */
ld r4,htab_hash_mask@got(2)
@@ -794,18 +835,21 @@ END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
/* Check if we may already be in the hashtable, in this case, we
* go to out-of-line code to try to modify the HPTE
*/
- andi. r0,r31,_PAGE_HASHPTE
+ rldicl. r0,r31,64-12,48
bne ht64_modify_pte
ht64_insert_pte:
/* Clear hpte bits in new pte (we also clear BUSY btw) and
- * add _PAGE_HASHPTE
+ * add _PAGE_HPTE_SUB0
*/
lis r0,_PAGE_HPTEFLAGS@h
ori r0,r0,_PAGE_HPTEFLAGS@l
andc r30,r30,r0
+#ifdef CONFIG_PPC_64K_PAGES
+ oris r30,r30,_PAGE_HPTE_SUB0@h
+#else
ori r30,r30,_PAGE_HASHPTE
-
+#endif
/* Phyical address in r5 */
rldicl r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT
sldi r5,r5,PAGE_SHIFT
@@ -815,12 +859,14 @@ ht64_insert_pte:
rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */
/* Call ppc_md.hpte_insert */
- ld r6,STK_PARM(r4)(r1) /* Retreive new pp bits */
- mr r4,r29 /* Retreive va */
+ ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */
+ mr r4,r29 /* Retrieve vpn */
li r7,0 /* !bolted, !secondary */
li r8,MMU_PAGE_64K
- ld r9,STK_PARM(r9)(r1) /* segment size */
-_GLOBAL(ht64_call_hpte_insert1)
+ li r9,MMU_PAGE_64K /* actual page size */
+ ld r10,STK_PARAM(R9)(r1) /* segment size */
+.globl ht64_call_hpte_insert1
+ht64_call_hpte_insert1:
bl . /* patched by htab_finish_init() */
cmpdi 0,r3,0
bge ht64_pte_insert_ok /* Insertion successful */
@@ -838,12 +884,14 @@ _GLOBAL(ht64_call_hpte_insert1)
rldicr r3,r0,3,63-3 /* r0 = (~hash & mask) << 3 */
/* Call ppc_md.hpte_insert */
- ld r6,STK_PARM(r4)(r1) /* Retreive new pp bits */
- mr r4,r29 /* Retreive va */
+ ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */
+ mr r4,r29 /* Retrieve vpn */
li r7,HPTE_V_SECONDARY /* !bolted, secondary */
li r8,MMU_PAGE_64K
- ld r9,STK_PARM(r9)(r1) /* segment size */
-_GLOBAL(ht64_call_hpte_insert2)
+ li r9,MMU_PAGE_64K /* actual page size */
+ ld r10,STK_PARAM(R9)(r1) /* segment size */
+.globl ht64_call_hpte_insert2
+ht64_call_hpte_insert2:
bl . /* patched by htab_finish_init() */
cmpdi 0,r3,0
bge+ ht64_pte_insert_ok /* Insertion successful */
@@ -860,7 +908,8 @@ _GLOBAL(ht64_call_hpte_insert2)
2: and r0,r5,r27
rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */
/* Call ppc_md.hpte_remove */
-_GLOBAL(ht64_call_hpte_remove)
+.globl ht64_call_hpte_remove
+ht64_call_hpte_remove:
bl . /* patched by htab_finish_init() */
/* Try all again */
@@ -878,15 +927,15 @@ ht64_pte_insert_ok:
* (maybe add eieio may be good still ?)
*/
ht64_write_out_pte:
- ld r6,STK_PARM(r6)(r1)
+ ld r6,STK_PARAM(R6)(r1)
std r30,0(r6)
li r3, 0
ht64_bail:
- ld r27,STK_REG(r27)(r1)
- ld r28,STK_REG(r28)(r1)
- ld r29,STK_REG(r29)(r1)
- ld r30,STK_REG(r30)(r1)
- ld r31,STK_REG(r31)(r1)
+ ld r27,STK_REG(R27)(r1)
+ ld r28,STK_REG(R28)(r1)
+ ld r29,STK_REG(R29)(r1)
+ ld r30,STK_REG(R30)(r1)
+ ld r31,STK_REG(R31)(r1)
addi r1,r1,STACKFRAMESIZE
ld r0,16(r1)
mtlr r0
@@ -909,11 +958,13 @@ ht64_modify_pte:
add r3,r0,r3 /* add slot idx */
/* Call ppc_md.hpte_updatepp */
- mr r5,r29 /* va */
- li r6,MMU_PAGE_64K
- ld r7,STK_PARM(r9)(r1) /* segment size */
- ld r8,STK_PARM(r8)(r1) /* get "local" param */
-_GLOBAL(ht64_call_hpte_updatepp)
+ mr r5,r29 /* vpn */
+ li r6,MMU_PAGE_64K /* base page size */
+ li r7,MMU_PAGE_64K /* actual page size */
+ ld r8,STK_PARAM(R9)(r1) /* segment size */
+ ld r9,STK_PARAM(R8)(r1) /* get "local" param */
+.globl ht64_call_hpte_updatepp
+ht64_call_hpte_updatepp:
bl . /* patched by htab_finish_init() */
/* if we failed because typically the HPTE wasn't really here
@@ -935,7 +986,7 @@ ht64_wrong_access:
ht64_pte_insert_failure:
/* Bail out restoring old PTE */
- ld r6,STK_PARM(r6)(r1)
+ ld r6,STK_PARAM(R6)(r1)
std r31,0(r6)
li r3,-1
b ht64_bail
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 34e5c0b219b..cf1d325eae8 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -14,10 +14,10 @@
#include <linux/spinlock.h>
#include <linux/bitops.h>
+#include <linux/of.h>
#include <linux/threads.h>
#include <linux/smp.h>
-#include <asm/abs_addr.h>
#include <asm/machdep.h>
#include <asm/mmu.h>
#include <asm/mmu_context.h>
@@ -27,6 +27,7 @@
#include <asm/cputable.h>
#include <asm/udbg.h>
#include <asm/kexec.h>
+#include <asm/ppc-opcode.h>
#ifdef DEBUG_LOW
#define DBG_LOW(fmt...) udbg_printf(fmt)
@@ -34,52 +35,110 @@
#define DBG_LOW(fmt...)
#endif
+#ifdef __BIG_ENDIAN__
#define HPTE_LOCK_BIT 3
+#else
+#define HPTE_LOCK_BIT (56+3)
+#endif
-static DEFINE_SPINLOCK(native_tlbie_lock);
+DEFINE_RAW_SPINLOCK(native_tlbie_lock);
-static inline void __tlbie(unsigned long va, int psize, int ssize)
+static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize)
{
+ unsigned long va;
unsigned int penc;
+ unsigned long sllp;
- /* clear top 16 bits, non SLS segment */
+ /*
+ * We need 14 to 65 bits of va for a tlibe of 4K page
+ * With vpn we ignore the lower VPN_SHIFT bits already.
+ * And top two bits are already ignored because we can
+ * only accomadate 76 bits in a 64 bit vpn with a VPN_SHIFT
+ * of 12.
+ */
+ va = vpn << VPN_SHIFT;
+ /*
+ * clear top 16 bits of 64bit va, non SLS segment
+ * Older versions of the architecture (2.02 and earler) require the
+ * masking of the top 16 bits.
+ */
va &= ~(0xffffULL << 48);
switch (psize) {
case MMU_PAGE_4K:
- va &= ~0xffful;
+ /* clear out bits after (52) [0....52.....63] */
+ va &= ~((1ul << (64 - 52)) - 1);
va |= ssize << 8;
- asm volatile("tlbie %0,0" : : "r" (va) : "memory");
+ sllp = ((mmu_psize_defs[apsize].sllp & SLB_VSID_L) >> 6) |
+ ((mmu_psize_defs[apsize].sllp & SLB_VSID_LP) >> 4);
+ va |= sllp << 5;
+ asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2)
+ : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
+ : "memory");
break;
default:
- penc = mmu_psize_defs[psize].penc;
- va &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
+ /* We need 14 to 14 + i bits of va */
+ penc = mmu_psize_defs[psize].penc[apsize];
+ va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1);
va |= penc << 12;
va |= ssize << 8;
- asm volatile("tlbie %0,1" : : "r" (va) : "memory");
+ /*
+ * AVAL bits:
+ * We don't need all the bits, but rest of the bits
+ * must be ignored by the processor.
+ * vpn cover upto 65 bits of va. (0...65) and we need
+ * 58..64 bits of va.
+ */
+ va |= (vpn & 0xfe); /* AVAL */
+ va |= 1; /* L */
+ asm volatile(ASM_FTR_IFCLR("tlbie %0,1", PPC_TLBIE(%1,%0), %2)
+ : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
+ : "memory");
break;
}
}
-static inline void __tlbiel(unsigned long va, int psize, int ssize)
+static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize)
{
+ unsigned long va;
unsigned int penc;
+ unsigned long sllp;
- /* clear top 16 bits, non SLS segment */
+ /* VPN_SHIFT can be atmost 12 */
+ va = vpn << VPN_SHIFT;
+ /*
+ * clear top 16 bits of 64 bit va, non SLS segment
+ * Older versions of the architecture (2.02 and earler) require the
+ * masking of the top 16 bits.
+ */
va &= ~(0xffffULL << 48);
switch (psize) {
case MMU_PAGE_4K:
- va &= ~0xffful;
+ /* clear out bits after(52) [0....52.....63] */
+ va &= ~((1ul << (64 - 52)) - 1);
va |= ssize << 8;
+ sllp = ((mmu_psize_defs[apsize].sllp & SLB_VSID_L) >> 6) |
+ ((mmu_psize_defs[apsize].sllp & SLB_VSID_LP) >> 4);
+ va |= sllp << 5;
asm volatile(".long 0x7c000224 | (%0 << 11) | (0 << 21)"
: : "r"(va) : "memory");
break;
default:
- penc = mmu_psize_defs[psize].penc;
- va &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
+ /* We need 14 to 14 + i bits of va */
+ penc = mmu_psize_defs[psize].penc[apsize];
+ va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1);
va |= penc << 12;
va |= ssize << 8;
+ /*
+ * AVAL bits:
+ * We don't need all the bits, but rest of the bits
+ * must be ignored by the processor.
+ * vpn cover upto 65 bits of va. (0...65) and we need
+ * 58..64 bits of va.
+ */
+ va |= (vpn & 0xfe);
+ va |= 1; /* L */
asm volatile(".long 0x7c000224 | (%0 << 11) | (1 << 21)"
: : "r"(va) : "memory");
break;
@@ -87,33 +146,34 @@ static inline void __tlbiel(unsigned long va, int psize, int ssize)
}
-static inline void tlbie(unsigned long va, int psize, int ssize, int local)
+static inline void tlbie(unsigned long vpn, int psize, int apsize,
+ int ssize, int local)
{
- unsigned int use_local = local && cpu_has_feature(CPU_FTR_TLBIEL);
- int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
+ unsigned int use_local = local && mmu_has_feature(MMU_FTR_TLBIEL);
+ int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
if (use_local)
use_local = mmu_psize_defs[psize].tlbiel;
if (lock_tlbie && !use_local)
- spin_lock(&native_tlbie_lock);
+ raw_spin_lock(&native_tlbie_lock);
asm volatile("ptesync": : :"memory");
if (use_local) {
- __tlbiel(va, psize, ssize);
+ __tlbiel(vpn, psize, apsize, ssize);
asm volatile("ptesync": : :"memory");
} else {
- __tlbie(va, psize, ssize);
+ __tlbie(vpn, psize, apsize, ssize);
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
if (lock_tlbie && !use_local)
- spin_unlock(&native_tlbie_lock);
+ raw_spin_unlock(&native_tlbie_lock);
}
static inline void native_lock_hpte(struct hash_pte *hptep)
{
- unsigned long *word = &hptep->v;
+ unsigned long *word = (unsigned long *)&hptep->v;
while (1) {
- if (!test_and_set_bit(HPTE_LOCK_BIT, word))
+ if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word))
break;
while(test_bit(HPTE_LOCK_BIT, word))
cpu_relax();
@@ -122,31 +182,30 @@ static inline void native_lock_hpte(struct hash_pte *hptep)
static inline void native_unlock_hpte(struct hash_pte *hptep)
{
- unsigned long *word = &hptep->v;
+ unsigned long *word = (unsigned long *)&hptep->v;
- asm volatile("lwsync":::"memory");
- clear_bit(HPTE_LOCK_BIT, word);
+ clear_bit_unlock(HPTE_LOCK_BIT, word);
}
-static long native_hpte_insert(unsigned long hpte_group, unsigned long va,
+static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
unsigned long pa, unsigned long rflags,
- unsigned long vflags, int psize, int ssize)
+ unsigned long vflags, int psize, int apsize, int ssize)
{
struct hash_pte *hptep = htab_address + hpte_group;
unsigned long hpte_v, hpte_r;
int i;
if (!(vflags & HPTE_V_BOLTED)) {
- DBG_LOW(" insert(group=%lx, va=%016lx, pa=%016lx,"
+ DBG_LOW(" insert(group=%lx, vpn=%016lx, pa=%016lx,"
" rflags=%lx, vflags=%lx, psize=%d)\n",
- hpte_group, va, pa, rflags, vflags, psize);
+ hpte_group, vpn, pa, rflags, vflags, psize);
}
for (i = 0; i < HPTES_PER_GROUP; i++) {
- if (! (hptep->v & HPTE_V_VALID)) {
+ if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID)) {
/* retry with lock held */
native_lock_hpte(hptep);
- if (! (hptep->v & HPTE_V_VALID))
+ if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID))
break;
native_unlock_hpte(hptep);
}
@@ -157,22 +216,22 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long va,
if (i == HPTES_PER_GROUP)
return -1;
- hpte_v = hpte_encode_v(va, psize, ssize) | vflags | HPTE_V_VALID;
- hpte_r = hpte_encode_r(pa, psize) | rflags;
+ hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
+ hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
if (!(vflags & HPTE_V_BOLTED)) {
DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n",
i, hpte_v, hpte_r);
}
- hptep->r = hpte_r;
+ hptep->r = cpu_to_be64(hpte_r);
/* Guarantee the second dword is visible before the valid bit */
eieio();
/*
* Now set the first dword including the valid bit
* NOTE: this also unlocks the hpte
*/
- hptep->v = hpte_v;
+ hptep->v = cpu_to_be64(hpte_v);
__asm__ __volatile__ ("ptesync" : : : "memory");
@@ -193,12 +252,12 @@ static long native_hpte_remove(unsigned long hpte_group)
for (i = 0; i < HPTES_PER_GROUP; i++) {
hptep = htab_address + hpte_group + slot_offset;
- hpte_v = hptep->v;
+ hpte_v = be64_to_cpu(hptep->v);
if ((hpte_v & HPTE_V_VALID) && !(hpte_v & HPTE_V_BOLTED)) {
/* retry with lock held */
native_lock_hpte(hptep);
- hpte_v = hptep->v;
+ hpte_v = be64_to_cpu(hptep->v);
if ((hpte_v & HPTE_V_VALID)
&& !(hpte_v & HPTE_V_BOLTED))
break;
@@ -219,41 +278,46 @@ static long native_hpte_remove(unsigned long hpte_group)
}
static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
- unsigned long va, int psize, int ssize,
- int local)
+ unsigned long vpn, int bpsize,
+ int apsize, int ssize, int local)
{
struct hash_pte *hptep = htab_address + slot;
unsigned long hpte_v, want_v;
int ret = 0;
- want_v = hpte_encode_v(va, psize, ssize);
+ want_v = hpte_encode_avpn(vpn, bpsize, ssize);
- DBG_LOW(" update(va=%016lx, avpnv=%016lx, hash=%016lx, newpp=%x)",
- va, want_v & HPTE_V_AVPN, slot, newpp);
+ DBG_LOW(" update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)",
+ vpn, want_v & HPTE_V_AVPN, slot, newpp);
native_lock_hpte(hptep);
- hpte_v = hptep->v;
-
- /* Even if we miss, we need to invalidate the TLB */
+ hpte_v = be64_to_cpu(hptep->v);
+ /*
+ * We need to invalidate the TLB always because hpte_remove doesn't do
+ * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
+ * random entry from it. When we do that we don't invalidate the TLB
+ * (hpte_remove) because we assume the old translation is still
+ * technically "valid".
+ */
if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) {
DBG_LOW(" -> miss\n");
ret = -1;
} else {
DBG_LOW(" -> hit\n");
/* Update the HPTE */
- hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) |
- (newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_C));
+ hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) & ~(HPTE_R_PP | HPTE_R_N)) |
+ (newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_C)));
}
native_unlock_hpte(hptep);
/* Ensure it is out of the tlb too. */
- tlbie(va, psize, ssize, local);
+ tlbie(vpn, bpsize, apsize, ssize, local);
return ret;
}
-static long native_hpte_find(unsigned long va, int psize, int ssize)
+static long native_hpte_find(unsigned long vpn, int psize, int ssize)
{
struct hash_pte *hptep;
unsigned long hash;
@@ -261,14 +325,14 @@ static long native_hpte_find(unsigned long va, int psize, int ssize)
long slot;
unsigned long want_v, hpte_v;
- hash = hpt_hash(va, mmu_psize_defs[psize].shift, ssize);
- want_v = hpte_encode_v(va, psize, ssize);
+ hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
+ want_v = hpte_encode_avpn(vpn, psize, ssize);
/* Bolted mappings are only ever in the primary group */
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
for (i = 0; i < HPTES_PER_GROUP; i++) {
hptep = htab_address + slot;
- hpte_v = hptep->v;
+ hpte_v = be64_to_cpu(hptep->v);
if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
/* HPTE matches */
@@ -289,28 +353,32 @@ static long native_hpte_find(unsigned long va, int psize, int ssize)
static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
int psize, int ssize)
{
- unsigned long vsid, va;
+ unsigned long vpn;
+ unsigned long vsid;
long slot;
struct hash_pte *hptep;
vsid = get_kernel_vsid(ea, ssize);
- va = hpt_va(ea, vsid, ssize);
+ vpn = hpt_vpn(ea, vsid, ssize);
- slot = native_hpte_find(va, psize, ssize);
+ slot = native_hpte_find(vpn, psize, ssize);
if (slot == -1)
panic("could not find page to bolt\n");
hptep = htab_address + slot;
/* Update the HPTE */
- hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) |
- (newpp & (HPTE_R_PP | HPTE_R_N));
-
- /* Ensure it is out of the tlb too. */
- tlbie(va, psize, ssize, 0);
+ hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) &
+ ~(HPTE_R_PP | HPTE_R_N)) |
+ (newpp & (HPTE_R_PP | HPTE_R_N)));
+ /*
+ * Ensure it is out of the tlb too. Bolted entries base and
+ * actual page size will be same.
+ */
+ tlbie(vpn, psize, psize, ssize, 0);
}
-static void native_hpte_invalidate(unsigned long slot, unsigned long va,
- int psize, int ssize, int local)
+static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
+ int bpsize, int apsize, int ssize, int local)
{
struct hash_pte *hptep = htab_address + slot;
unsigned long hpte_v;
@@ -319,13 +387,19 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long va,
local_irq_save(flags);
- DBG_LOW(" invalidate(va=%016lx, hash: %x)\n", va, slot);
+ DBG_LOW(" invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot);
- want_v = hpte_encode_v(va, psize, ssize);
+ want_v = hpte_encode_avpn(vpn, bpsize, ssize);
native_lock_hpte(hptep);
- hpte_v = hptep->v;
+ hpte_v = be64_to_cpu(hptep->v);
- /* Even if we miss, we need to invalidate the TLB */
+ /*
+ * We need to invalidate the TLB always because hpte_remove doesn't do
+ * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
+ * random entry from it. When we do that we don't invalidate the TLB
+ * (hpte_remove) because we assume the old translation is still
+ * technically "valid".
+ */
if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
native_unlock_hpte(hptep);
else
@@ -333,73 +407,175 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long va,
hptep->v = 0;
/* Invalidate the TLB */
- tlbie(va, psize, ssize, local);
+ tlbie(vpn, bpsize, apsize, ssize, local);
+
+ local_irq_restore(flags);
+}
+
+static void native_hugepage_invalidate(struct mm_struct *mm,
+ unsigned char *hpte_slot_array,
+ unsigned long addr, int psize)
+{
+ int ssize = 0, i;
+ int lock_tlbie;
+ struct hash_pte *hptep;
+ int actual_psize = MMU_PAGE_16M;
+ unsigned int max_hpte_count, valid;
+ unsigned long flags, s_addr = addr;
+ unsigned long hpte_v, want_v, shift;
+ unsigned long hidx, vpn = 0, vsid, hash, slot;
+
+ shift = mmu_psize_defs[psize].shift;
+ max_hpte_count = 1U << (PMD_SHIFT - shift);
+
+ local_irq_save(flags);
+ for (i = 0; i < max_hpte_count; i++) {
+ valid = hpte_valid(hpte_slot_array, i);
+ if (!valid)
+ continue;
+ hidx = hpte_hash_index(hpte_slot_array, i);
+
+ /* get the vpn */
+ addr = s_addr + (i * (1ul << shift));
+ if (!is_kernel_addr(addr)) {
+ ssize = user_segment_size(addr);
+ vsid = get_vsid(mm->context.id, addr, ssize);
+ WARN_ON(vsid == 0);
+ } else {
+ vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+ ssize = mmu_kernel_ssize;
+ }
+
+ vpn = hpt_vpn(addr, vsid, ssize);
+ hash = hpt_hash(vpn, shift, ssize);
+ if (hidx & _PTEIDX_SECONDARY)
+ hash = ~hash;
+
+ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot += hidx & _PTEIDX_GROUP_IX;
+
+ hptep = htab_address + slot;
+ want_v = hpte_encode_avpn(vpn, psize, ssize);
+ native_lock_hpte(hptep);
+ hpte_v = be64_to_cpu(hptep->v);
+
+ /* Even if we miss, we need to invalidate the TLB */
+ if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
+ native_unlock_hpte(hptep);
+ else
+ /* Invalidate the hpte. NOTE: this also unlocks it */
+ hptep->v = 0;
+ }
+ /*
+ * Since this is a hugepage, we just need a single tlbie.
+ * use the last vpn.
+ */
+ lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
+ if (lock_tlbie)
+ raw_spin_lock(&native_tlbie_lock);
+
+ asm volatile("ptesync":::"memory");
+ __tlbie(vpn, psize, actual_psize, ssize);
+ asm volatile("eieio; tlbsync; ptesync":::"memory");
+
+ if (lock_tlbie)
+ raw_spin_unlock(&native_tlbie_lock);
local_irq_restore(flags);
}
-#define LP_SHIFT 12
-#define LP_BITS 8
-#define LP_MASK(i) ((0xFF >> (i)) << LP_SHIFT)
+static inline int __hpte_actual_psize(unsigned int lp, int psize)
+{
+ int i, shift;
+ unsigned int mask;
+
+ /* start from 1 ignoring MMU_PAGE_4K */
+ for (i = 1; i < MMU_PAGE_COUNT; i++) {
+
+ /* invalid penc */
+ if (mmu_psize_defs[psize].penc[i] == -1)
+ continue;
+ /*
+ * encoding bits per actual page size
+ * PTE LP actual page size
+ * rrrr rrrz >=8KB
+ * rrrr rrzz >=16KB
+ * rrrr rzzz >=32KB
+ * rrrr zzzz >=64KB
+ * .......
+ */
+ shift = mmu_psize_defs[i].shift - LP_SHIFT;
+ if (shift > LP_BITS)
+ shift = LP_BITS;
+ mask = (1 << shift) - 1;
+ if ((lp & mask) == mmu_psize_defs[psize].penc[i])
+ return i;
+ }
+ return -1;
+}
static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
- int *psize, int *ssize, unsigned long *va)
+ int *psize, int *apsize, int *ssize, unsigned long *vpn)
{
- unsigned long hpte_r = hpte->r;
- unsigned long hpte_v = hpte->v;
- unsigned long avpn;
- int i, size, shift, penc;
-
- if (!(hpte_v & HPTE_V_LARGE))
- size = MMU_PAGE_4K;
- else {
- for (i = 0; i < LP_BITS; i++) {
- if ((hpte_r & LP_MASK(i+1)) == LP_MASK(i+1))
- break;
- }
- penc = LP_MASK(i+1) >> LP_SHIFT;
+ unsigned long avpn, pteg, vpi;
+ unsigned long hpte_v = be64_to_cpu(hpte->v);
+ unsigned long hpte_r = be64_to_cpu(hpte->r);
+ unsigned long vsid, seg_off;
+ int size, a_size, shift;
+ /* Look at the 8 bit LP value */
+ unsigned int lp = (hpte_r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
+
+ if (!(hpte_v & HPTE_V_LARGE)) {
+ size = MMU_PAGE_4K;
+ a_size = MMU_PAGE_4K;
+ } else {
for (size = 0; size < MMU_PAGE_COUNT; size++) {
- /* 4K pages are not represented by LP */
- if (size == MMU_PAGE_4K)
- continue;
-
/* valid entries have a shift value */
if (!mmu_psize_defs[size].shift)
continue;
- if (penc == mmu_psize_defs[size].penc)
+ a_size = __hpte_actual_psize(lp, size);
+ if (a_size != -1)
break;
}
}
-
/* This works for all page sizes, and for 256M and 1T segments */
+ *ssize = hpte_v >> HPTE_V_SSIZE_SHIFT;
shift = mmu_psize_defs[size].shift;
- avpn = (HPTE_V_AVPN_VAL(hpte_v) & ~mmu_psize_defs[size].avpnm) << 23;
- if (shift < 23) {
- unsigned long vpi, vsid, pteg;
-
- pteg = slot / HPTES_PER_GROUP;
- if (hpte_v & HPTE_V_SECONDARY)
- pteg = ~pteg;
- switch (hpte_v >> HPTE_V_SSIZE_SHIFT) {
- case MMU_SEGSIZE_256M:
- vpi = ((avpn >> 28) ^ pteg) & htab_hash_mask;
- break;
- case MMU_SEGSIZE_1T:
- vsid = avpn >> 40;
+ avpn = (HPTE_V_AVPN_VAL(hpte_v) & ~mmu_psize_defs[size].avpnm);
+ pteg = slot / HPTES_PER_GROUP;
+ if (hpte_v & HPTE_V_SECONDARY)
+ pteg = ~pteg;
+
+ switch (*ssize) {
+ case MMU_SEGSIZE_256M:
+ /* We only have 28 - 23 bits of seg_off in avpn */
+ seg_off = (avpn & 0x1f) << 23;
+ vsid = avpn >> 5;
+ /* We can find more bits from the pteg value */
+ if (shift < 23) {
+ vpi = (vsid ^ pteg) & htab_hash_mask;
+ seg_off |= vpi << shift;
+ }
+ *vpn = vsid << (SID_SHIFT - VPN_SHIFT) | seg_off >> VPN_SHIFT;
+ break;
+ case MMU_SEGSIZE_1T:
+ /* We only have 40 - 23 bits of seg_off in avpn */
+ seg_off = (avpn & 0x1ffff) << 23;
+ vsid = avpn >> 17;
+ if (shift < 23) {
vpi = (vsid ^ (vsid << 25) ^ pteg) & htab_hash_mask;
- break;
- default:
- avpn = vpi = size = 0;
+ seg_off |= vpi << shift;
}
- avpn |= (vpi << mmu_psize_defs[size].shift);
+ *vpn = vsid << (SID_SHIFT_1T - VPN_SHIFT) | seg_off >> VPN_SHIFT;
+ break;
+ default:
+ *vpn = size = 0;
}
-
- *va = avpn;
- *psize = size;
- *ssize = hpte_v >> HPTE_V_SSIZE_SHIFT;
+ *psize = size;
+ *apsize = a_size;
}
/*
@@ -412,11 +588,12 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
*/
static void native_hpte_clear(void)
{
+ unsigned long vpn = 0;
unsigned long slot, slots, flags;
struct hash_pte *hptep = htab_address;
- unsigned long hpte_v, va;
+ unsigned long hpte_v;
unsigned long pteg_count;
- int psize, ssize;
+ int psize, apsize, ssize;
pteg_count = htab_hash_mask + 1;
@@ -425,7 +602,7 @@ static void native_hpte_clear(void)
/* we take the tlbie lock and hold it. Some hardware will
* deadlock if we try to tlbie from two processors at once.
*/
- spin_lock(&native_tlbie_lock);
+ raw_spin_lock(&native_tlbie_lock);
slots = pteg_count * HPTES_PER_GROUP;
@@ -435,21 +612,21 @@ static void native_hpte_clear(void)
* running, right? and for crash dump, we probably
* don't want to wait for a maybe bad cpu.
*/
- hpte_v = hptep->v;
+ hpte_v = be64_to_cpu(hptep->v);
/*
* Call __tlbie() here rather than tlbie() since we
* already hold the native_tlbie_lock.
*/
if (hpte_v & HPTE_V_VALID) {
- hpte_decode(hptep, slot, &psize, &ssize, &va);
+ hpte_decode(hptep, slot, &psize, &apsize, &ssize, &vpn);
hptep->v = 0;
- __tlbie(va, psize, ssize);
+ __tlbie(vpn, psize, apsize, ssize);
}
}
asm volatile("eieio; tlbsync; ptesync":::"memory");
- spin_unlock(&native_tlbie_lock);
+ raw_spin_unlock(&native_tlbie_lock);
local_irq_restore(flags);
}
@@ -459,7 +636,8 @@ static void native_hpte_clear(void)
*/
static void native_flush_hash_range(unsigned long number, int local)
{
- unsigned long va, hash, index, hidx, shift, slot;
+ unsigned long vpn;
+ unsigned long hash, index, hidx, shift, slot;
struct hash_pte *hptep;
unsigned long hpte_v;
unsigned long want_v;
@@ -473,20 +651,20 @@ static void native_flush_hash_range(unsigned long number, int local)
local_irq_save(flags);
for (i = 0; i < number; i++) {
- va = batch->vaddr[i];
+ vpn = batch->vpn[i];
pte = batch->pte[i];
- pte_iterate_hashed_subpages(pte, psize, va, index, shift) {
- hash = hpt_hash(va, shift, ssize);
+ pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
+ hash = hpt_hash(vpn, shift, ssize);
hidx = __rpte_to_hidx(pte, index);
if (hidx & _PTEIDX_SECONDARY)
hash = ~hash;
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
slot += hidx & _PTEIDX_GROUP_IX;
hptep = htab_address + slot;
- want_v = hpte_encode_v(va, psize, ssize);
+ want_v = hpte_encode_avpn(vpn, psize, ssize);
native_lock_hpte(hptep);
- hpte_v = hptep->v;
+ hpte_v = be64_to_cpu(hptep->v);
if (!HPTE_V_COMPARE(hpte_v, want_v) ||
!(hpte_v & HPTE_V_VALID))
native_unlock_hpte(hptep);
@@ -495,67 +673,44 @@ static void native_flush_hash_range(unsigned long number, int local)
} pte_iterate_hashed_end();
}
- if (cpu_has_feature(CPU_FTR_TLBIEL) &&
+ if (mmu_has_feature(MMU_FTR_TLBIEL) &&
mmu_psize_defs[psize].tlbiel && local) {
asm volatile("ptesync":::"memory");
for (i = 0; i < number; i++) {
- va = batch->vaddr[i];
+ vpn = batch->vpn[i];
pte = batch->pte[i];
- pte_iterate_hashed_subpages(pte, psize, va, index,
- shift) {
- __tlbiel(va, psize, ssize);
+ pte_iterate_hashed_subpages(pte, psize,
+ vpn, index, shift) {
+ __tlbiel(vpn, psize, psize, ssize);
} pte_iterate_hashed_end();
}
asm volatile("ptesync":::"memory");
} else {
- int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
+ int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
if (lock_tlbie)
- spin_lock(&native_tlbie_lock);
+ raw_spin_lock(&native_tlbie_lock);
asm volatile("ptesync":::"memory");
for (i = 0; i < number; i++) {
- va = batch->vaddr[i];
+ vpn = batch->vpn[i];
pte = batch->pte[i];
- pte_iterate_hashed_subpages(pte, psize, va, index,
- shift) {
- __tlbie(va, psize, ssize);
+ pte_iterate_hashed_subpages(pte, psize,
+ vpn, index, shift) {
+ __tlbie(vpn, psize, psize, ssize);
} pte_iterate_hashed_end();
}
asm volatile("eieio; tlbsync; ptesync":::"memory");
if (lock_tlbie)
- spin_unlock(&native_tlbie_lock);
+ raw_spin_unlock(&native_tlbie_lock);
}
local_irq_restore(flags);
}
-#ifdef CONFIG_PPC_PSERIES
-/* Disable TLB batching on nighthawk */
-static inline int tlb_batching_enabled(void)
-{
- struct device_node *root = of_find_node_by_path("/");
- int enabled = 1;
-
- if (root) {
- const char *model = of_get_property(root, "model", NULL);
- if (model && !strcmp(model, "IBM,9076-N81"))
- enabled = 0;
- of_node_put(root);
- }
-
- return enabled;
-}
-#else
-static inline int tlb_batching_enabled(void)
-{
- return 1;
-}
-#endif
-
void __init hpte_init_native(void)
{
ppc_md.hpte_invalidate = native_hpte_invalidate;
@@ -564,6 +719,6 @@ void __init hpte_init_native(void)
ppc_md.hpte_insert = native_hpte_insert;
ppc_md.hpte_remove = native_hpte_remove;
ppc_md.hpte_clear_all = native_hpte_clear;
- if (tlb_batching_enabled())
- ppc_md.flush_hash_range = native_flush_hash_range;
+ ppc_md.flush_hash_range = native_flush_hash_range;
+ ppc_md.hugepage_invalidate = native_hugepage_invalidate;
}
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index a83dfa3cf40..88fdd9d2507 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -27,10 +27,13 @@
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/sysctl.h>
+#include <linux/export.h>
#include <linux/ctype.h>
#include <linux/cache.h>
#include <linux/init.h>
#include <linux/signal.h>
+#include <linux/memblock.h>
+#include <linux/context_tracking.h>
#include <asm/processor.h>
#include <asm/pgtable.h>
@@ -38,11 +41,9 @@
#include <asm/mmu_context.h>
#include <asm/page.h>
#include <asm/types.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/machdep.h>
-#include <asm/lmb.h>
-#include <asm/abs_addr.h>
+#include <asm/prom.h>
#include <asm/tlbflush.h>
#include <asm/io.h>
#include <asm/eeh.h>
@@ -52,6 +53,10 @@
#include <asm/sections.h>
#include <asm/spu.h>
#include <asm/udbg.h>
+#include <asm/code-patching.h>
+#include <asm/fadump.h>
+#include <asm/firmware.h>
+#include <asm/tm.h>
#ifdef DEBUG
#define DBG(fmt...) udbg_printf(fmt)
@@ -67,6 +72,7 @@
#define KB (1024)
#define MB (1024*KB)
+#define GB (1024L*MB)
/*
* Note: pte --> Linux PTE
@@ -90,17 +96,18 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
struct hash_pte *htab_address;
unsigned long htab_size_bytes;
unsigned long htab_hash_mask;
+EXPORT_SYMBOL_GPL(htab_hash_mask);
int mmu_linear_psize = MMU_PAGE_4K;
int mmu_virtual_psize = MMU_PAGE_4K;
int mmu_vmalloc_psize = MMU_PAGE_4K;
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+int mmu_vmemmap_psize = MMU_PAGE_4K;
+#endif
int mmu_io_psize = MMU_PAGE_4K;
int mmu_kernel_ssize = MMU_SEGSIZE_256M;
int mmu_highuser_ssize = MMU_SEGSIZE_256M;
u16 mmu_slb_size = 64;
-#ifdef CONFIG_HUGETLB_PAGE
-int mmu_huge_psize = MMU_PAGE_16M;
-unsigned int HPAGE_SHIFT;
-#endif
+EXPORT_SYMBOL_GPL(mmu_slb_size);
#ifdef CONFIG_PPC_64K_PAGES
int mmu_ci_restrictions;
#endif
@@ -116,11 +123,11 @@ static DEFINE_SPINLOCK(linear_map_hash_lock);
/* Pre-POWER4 CPUs (4k pages only)
*/
-struct mmu_psize_def mmu_psize_defaults_old[] = {
+static struct mmu_psize_def mmu_psize_defaults_old[] = {
[MMU_PAGE_4K] = {
.shift = 12,
.sllp = 0,
- .penc = 0,
+ .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1},
.avpnm = 0,
.tlbiel = 0,
},
@@ -130,56 +137,100 @@ struct mmu_psize_def mmu_psize_defaults_old[] = {
*
* Support for 16Mb large pages
*/
-struct mmu_psize_def mmu_psize_defaults_gp[] = {
+static struct mmu_psize_def mmu_psize_defaults_gp[] = {
[MMU_PAGE_4K] = {
.shift = 12,
.sllp = 0,
- .penc = 0,
+ .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1},
.avpnm = 0,
.tlbiel = 1,
},
[MMU_PAGE_16M] = {
.shift = 24,
.sllp = SLB_VSID_L,
- .penc = 0,
+ .penc = {[0 ... MMU_PAGE_16M - 1] = -1, [MMU_PAGE_16M] = 0,
+ [MMU_PAGE_16M + 1 ... MMU_PAGE_COUNT - 1] = -1 },
.avpnm = 0x1UL,
.tlbiel = 0,
},
};
+static unsigned long htab_convert_pte_flags(unsigned long pteflags)
+{
+ unsigned long rflags = pteflags & 0x1fa;
+
+ /* _PAGE_EXEC -> NOEXEC */
+ if ((pteflags & _PAGE_EXEC) == 0)
+ rflags |= HPTE_R_N;
+
+ /* PP bits. PAGE_USER is already PP bit 0x2, so we only
+ * need to add in 0x1 if it's a read-only user page
+ */
+ if ((pteflags & _PAGE_USER) && !((pteflags & _PAGE_RW) &&
+ (pteflags & _PAGE_DIRTY)))
+ rflags |= 1;
+ /*
+ * Always add "C" bit for perf. Memory coherence is always enabled
+ */
+ return rflags | HPTE_R_C | HPTE_R_M;
+}
int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
- unsigned long pstart, unsigned long mode,
+ unsigned long pstart, unsigned long prot,
int psize, int ssize)
{
unsigned long vaddr, paddr;
unsigned int step, shift;
- unsigned long tmp_mode;
int ret = 0;
shift = mmu_psize_defs[psize].shift;
step = 1 << shift;
+ prot = htab_convert_pte_flags(prot);
+
+ DBG("htab_bolt_mapping(%lx..%lx -> %lx (%lx,%d,%d)\n",
+ vstart, vend, pstart, prot, psize, ssize);
+
for (vaddr = vstart, paddr = pstart; vaddr < vend;
vaddr += step, paddr += step) {
unsigned long hash, hpteg;
unsigned long vsid = get_kernel_vsid(vaddr, ssize);
- unsigned long va = hpt_va(vaddr, vsid, ssize);
+ unsigned long vpn = hpt_vpn(vaddr, vsid, ssize);
+ unsigned long tprot = prot;
- tmp_mode = mode;
-
- /* Make non-kernel text non-executable */
- if (!in_kernel_text(vaddr))
- tmp_mode = mode | HPTE_R_N;
+ /*
+ * If we hit a bad address return error.
+ */
+ if (!vsid)
+ return -1;
+ /* Make kernel text executable */
+ if (overlaps_kernel_text(vaddr, vaddr + step))
+ tprot &= ~HPTE_R_N;
+
+ /* Make kvm guest trampolines executable */
+ if (overlaps_kvm_tmp(vaddr, vaddr + step))
+ tprot &= ~HPTE_R_N;
+
+ /*
+ * If relocatable, check if it overlaps interrupt vectors that
+ * are copied down to real 0. For relocatable kernel
+ * (e.g. kdump case) we copy interrupt vectors down to real
+ * address 0. Mark that region as executable. This is
+ * because on p8 system with relocation on exception feature
+ * enabled, exceptions are raised with MMU (IR=DR=1) ON. Hence
+ * in order to execute the interrupt handlers in virtual
+ * mode the vector region need to be marked as executable.
+ */
+ if ((PHYSICAL_START > MEMORY_START) &&
+ overlaps_interrupt_vector_text(vaddr, vaddr + step))
+ tprot &= ~HPTE_R_N;
- hash = hpt_hash(va, shift, ssize);
+ hash = hpt_hash(vpn, shift, ssize);
hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
- DBG("htab_bolt_mapping: calling %p\n", ppc_md.hpte_insert);
-
BUG_ON(!ppc_md.hpte_insert);
- ret = ppc_md.hpte_insert(hpteg, va, paddr,
- tmp_mode, HPTE_V_BOLTED, psize, ssize);
+ ret = ppc_md.hpte_insert(hpteg, vpn, paddr, tprot,
+ HPTE_V_BOLTED, psize, psize, ssize);
if (ret < 0)
break;
@@ -191,30 +242,52 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
return ret < 0 ? ret : 0;
}
+#ifdef CONFIG_MEMORY_HOTPLUG
+static int htab_remove_mapping(unsigned long vstart, unsigned long vend,
+ int psize, int ssize)
+{
+ unsigned long vaddr;
+ unsigned int step, shift;
+
+ shift = mmu_psize_defs[psize].shift;
+ step = 1 << shift;
+
+ if (!ppc_md.hpte_removebolted) {
+ printk(KERN_WARNING "Platform doesn't implement "
+ "hpte_removebolted\n");
+ return -EINVAL;
+ }
+
+ for (vaddr = vstart; vaddr < vend; vaddr += step)
+ ppc_md.hpte_removebolted(vaddr, psize, ssize);
+
+ return 0;
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
static int __init htab_dt_scan_seg_sizes(unsigned long node,
const char *uname, int depth,
void *data)
{
- char *type = of_get_flat_dt_prop(node, "device_type", NULL);
- u32 *prop;
- unsigned long size = 0;
+ const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+ const __be32 *prop;
+ int size = 0;
/* We are scanning "cpu" nodes only */
if (type == NULL || strcmp(type, "cpu") != 0)
return 0;
- prop = (u32 *)of_get_flat_dt_prop(node, "ibm,processor-segment-sizes",
- &size);
+ prop = of_get_flat_dt_prop(node, "ibm,processor-segment-sizes", &size);
if (prop == NULL)
return 0;
for (; size >= 4; size -= 4, ++prop) {
- if (prop[0] == 40) {
+ if (be32_to_cpu(prop[0]) == 40) {
DBG("1T segment support detected\n");
- cur_cpu_spec->cpu_features |= CPU_FTR_1T_SEGMENT;
+ cur_cpu_spec->mmu_features |= MMU_FTR_1T_SEGMENT;
return 1;
}
}
- cur_cpu_spec->cpu_features &= ~CPU_FTR_NO_SLBIE_B;
+ cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
return 0;
}
@@ -223,89 +296,184 @@ static void __init htab_init_seg_sizes(void)
of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL);
}
+static int __init get_idx_from_shift(unsigned int shift)
+{
+ int idx = -1;
+
+ switch (shift) {
+ case 0xc:
+ idx = MMU_PAGE_4K;
+ break;
+ case 0x10:
+ idx = MMU_PAGE_64K;
+ break;
+ case 0x14:
+ idx = MMU_PAGE_1M;
+ break;
+ case 0x18:
+ idx = MMU_PAGE_16M;
+ break;
+ case 0x22:
+ idx = MMU_PAGE_16G;
+ break;
+ }
+ return idx;
+}
+
static int __init htab_dt_scan_page_sizes(unsigned long node,
const char *uname, int depth,
void *data)
{
- char *type = of_get_flat_dt_prop(node, "device_type", NULL);
- u32 *prop;
- unsigned long size = 0;
+ const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+ const __be32 *prop;
+ int size = 0;
/* We are scanning "cpu" nodes only */
if (type == NULL || strcmp(type, "cpu") != 0)
return 0;
- prop = (u32 *)of_get_flat_dt_prop(node,
- "ibm,segment-page-sizes", &size);
+ prop = of_get_flat_dt_prop(node, "ibm,segment-page-sizes", &size);
if (prop != NULL) {
- DBG("Page sizes from device-tree:\n");
+ pr_info("Page sizes from device-tree:\n");
size /= 4;
- cur_cpu_spec->cpu_features &= ~(CPU_FTR_16M_PAGE);
+ cur_cpu_spec->mmu_features &= ~(MMU_FTR_16M_PAGE);
while(size > 0) {
- unsigned int shift = prop[0];
- unsigned int slbenc = prop[1];
- unsigned int lpnum = prop[2];
- unsigned int lpenc = 0;
+ unsigned int base_shift = be32_to_cpu(prop[0]);
+ unsigned int slbenc = be32_to_cpu(prop[1]);
+ unsigned int lpnum = be32_to_cpu(prop[2]);
struct mmu_psize_def *def;
- int idx = -1;
+ int idx, base_idx;
size -= 3; prop += 3;
- while(size > 0 && lpnum) {
- if (prop[0] == shift)
- lpenc = prop[1];
- prop += 2; size -= 2;
- lpnum--;
- }
- switch(shift) {
- case 0xc:
- idx = MMU_PAGE_4K;
- break;
- case 0x10:
- idx = MMU_PAGE_64K;
- break;
- case 0x14:
- idx = MMU_PAGE_1M;
- break;
- case 0x18:
- idx = MMU_PAGE_16M;
- cur_cpu_spec->cpu_features |= CPU_FTR_16M_PAGE;
- break;
- case 0x22:
- idx = MMU_PAGE_16G;
- break;
- }
- if (idx < 0)
+ base_idx = get_idx_from_shift(base_shift);
+ if (base_idx < 0) {
+ /*
+ * skip the pte encoding also
+ */
+ prop += lpnum * 2; size -= lpnum * 2;
continue;
- def = &mmu_psize_defs[idx];
- def->shift = shift;
- if (shift <= 23)
+ }
+ def = &mmu_psize_defs[base_idx];
+ if (base_idx == MMU_PAGE_16M)
+ cur_cpu_spec->mmu_features |= MMU_FTR_16M_PAGE;
+
+ def->shift = base_shift;
+ if (base_shift <= 23)
def->avpnm = 0;
else
- def->avpnm = (1 << (shift - 23)) - 1;
+ def->avpnm = (1 << (base_shift - 23)) - 1;
def->sllp = slbenc;
- def->penc = lpenc;
- /* We don't know for sure what's up with tlbiel, so
+ /*
+ * We don't know for sure what's up with tlbiel, so
* for now we only set it for 4K and 64K pages
*/
- if (idx == MMU_PAGE_4K || idx == MMU_PAGE_64K)
+ if (base_idx == MMU_PAGE_4K || base_idx == MMU_PAGE_64K)
def->tlbiel = 1;
else
def->tlbiel = 0;
- DBG(" %d: shift=%02x, sllp=%04x, avpnm=%08x, "
- "tlbiel=%d, penc=%d\n",
- idx, shift, def->sllp, def->avpnm, def->tlbiel,
- def->penc);
+ while (size > 0 && lpnum) {
+ unsigned int shift = be32_to_cpu(prop[0]);
+ int penc = be32_to_cpu(prop[1]);
+
+ prop += 2; size -= 2;
+ lpnum--;
+
+ idx = get_idx_from_shift(shift);
+ if (idx < 0)
+ continue;
+
+ if (penc == -1)
+ pr_err("Invalid penc for base_shift=%d "
+ "shift=%d\n", base_shift, shift);
+
+ def->penc[idx] = penc;
+ pr_info("base_shift=%d: shift=%d, sllp=0x%04lx,"
+ " avpnm=0x%08lx, tlbiel=%d, penc=%d\n",
+ base_shift, shift, def->sllp,
+ def->avpnm, def->tlbiel, def->penc[idx]);
+ }
}
return 1;
}
return 0;
}
+#ifdef CONFIG_HUGETLB_PAGE
+/* Scan for 16G memory blocks that have been set aside for huge pages
+ * and reserve those blocks for 16G huge pages.
+ */
+static int __init htab_dt_scan_hugepage_blocks(unsigned long node,
+ const char *uname, int depth,
+ void *data) {
+ const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+ const __be64 *addr_prop;
+ const __be32 *page_count_prop;
+ unsigned int expected_pages;
+ long unsigned int phys_addr;
+ long unsigned int block_size;
+
+ /* We are scanning "memory" nodes only */
+ if (type == NULL || strcmp(type, "memory") != 0)
+ return 0;
+
+ /* This property is the log base 2 of the number of virtual pages that
+ * will represent this memory block. */
+ page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL);
+ if (page_count_prop == NULL)
+ return 0;
+ expected_pages = (1 << be32_to_cpu(page_count_prop[0]));
+ addr_prop = of_get_flat_dt_prop(node, "reg", NULL);
+ if (addr_prop == NULL)
+ return 0;
+ phys_addr = be64_to_cpu(addr_prop[0]);
+ block_size = be64_to_cpu(addr_prop[1]);
+ if (block_size != (16 * GB))
+ return 0;
+ printk(KERN_INFO "Huge page(16GB) memory: "
+ "addr = 0x%lX size = 0x%lX pages = %d\n",
+ phys_addr, block_size, expected_pages);
+ if (phys_addr + (16 * GB) <= memblock_end_of_DRAM()) {
+ memblock_reserve(phys_addr, block_size * expected_pages);
+ add_gpage(phys_addr, block_size, expected_pages);
+ }
+ return 0;
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
+static void mmu_psize_set_default_penc(void)
+{
+ int bpsize, apsize;
+ for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++)
+ for (apsize = 0; apsize < MMU_PAGE_COUNT; apsize++)
+ mmu_psize_defs[bpsize].penc[apsize] = -1;
+}
+
+#ifdef CONFIG_PPC_64K_PAGES
+
+static bool might_have_hea(void)
+{
+ /*
+ * The HEA ethernet adapter requires awareness of the
+ * GX bus. Without that awareness we can easily assume
+ * we will never see an HEA ethernet device.
+ */
+#ifdef CONFIG_IBMEBUS
+ return !cpu_has_feature(CPU_FTR_ARCH_207S);
+#else
+ return false;
+#endif
+}
+
+#endif /* #ifdef CONFIG_PPC_64K_PAGES */
+
static void __init htab_init_page_sizes(void)
{
int rc;
+ /* se the invalid penc to -1 */
+ mmu_psize_set_default_penc();
+
/* Default to 4K pages only */
memcpy(mmu_psize_defs, mmu_psize_defaults_old,
sizeof(mmu_psize_defaults_old));
@@ -321,7 +489,7 @@ static void __init htab_init_page_sizes(void)
* Not in the device-tree, let's fallback on known size
* list for 16M capable GP & GR
*/
- if (cpu_has_feature(CPU_FTR_16M_PAGE))
+ if (mmu_has_feature(MMU_FTR_16M_PAGE))
memcpy(mmu_psize_defs, mmu_psize_defaults_gp,
sizeof(mmu_psize_defaults_gp));
found:
@@ -351,34 +519,49 @@ static void __init htab_init_page_sizes(void)
mmu_vmalloc_psize = MMU_PAGE_64K;
if (mmu_linear_psize == MMU_PAGE_4K)
mmu_linear_psize = MMU_PAGE_64K;
- if (cpu_has_feature(CPU_FTR_CI_LARGE_PAGE)) {
+ if (mmu_has_feature(MMU_FTR_CI_LARGE_PAGE)) {
/*
- * Don't use 64k pages for ioremap on pSeries, since
- * that would stop us accessing the HEA ethernet.
+ * When running on pSeries using 64k pages for ioremap
+ * would stop us accessing the HEA ethernet. So if we
+ * have the chance of ever seeing one, stay at 4k.
*/
- if (!machine_is(pseries))
+ if (!might_have_hea() || !machine_is(pseries))
mmu_io_psize = MMU_PAGE_64K;
} else
mmu_ci_restrictions = 1;
}
#endif /* CONFIG_PPC_64K_PAGES */
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ /* We try to use 16M pages for vmemmap if that is supported
+ * and we have at least 1G of RAM at boot
+ */
+ if (mmu_psize_defs[MMU_PAGE_16M].shift &&
+ memblock_phys_mem_size() >= 0x40000000)
+ mmu_vmemmap_psize = MMU_PAGE_16M;
+ else if (mmu_psize_defs[MMU_PAGE_64K].shift)
+ mmu_vmemmap_psize = MMU_PAGE_64K;
+ else
+ mmu_vmemmap_psize = MMU_PAGE_4K;
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+
printk(KERN_DEBUG "Page orders: linear mapping = %d, "
- "virtual = %d, io = %d\n",
+ "virtual = %d, io = %d"
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ ", vmemmap = %d"
+#endif
+ "\n",
mmu_psize_defs[mmu_linear_psize].shift,
mmu_psize_defs[mmu_virtual_psize].shift,
- mmu_psize_defs[mmu_io_psize].shift);
+ mmu_psize_defs[mmu_io_psize].shift
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ ,mmu_psize_defs[mmu_vmemmap_psize].shift
+#endif
+ );
#ifdef CONFIG_HUGETLB_PAGE
- /* Init large page size. Currently, we pick 16M or 1M depending
- * on what is available
- */
- if (mmu_psize_defs[MMU_PAGE_16M].shift)
- set_huge_psize(MMU_PAGE_16M);
- /* With 4k/4level pagetables, we can't (for now) cope with a
- * huge page size < PMD_SIZE */
- else if (mmu_psize_defs[MMU_PAGE_1M].shift)
- set_huge_psize(MMU_PAGE_1M);
+ /* Reserve 16G huge page memory sections for huge pages */
+ of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL);
#endif /* CONFIG_HUGETLB_PAGE */
}
@@ -386,17 +569,17 @@ static int __init htab_dt_scan_pftsize(unsigned long node,
const char *uname, int depth,
void *data)
{
- char *type = of_get_flat_dt_prop(node, "device_type", NULL);
- u32 *prop;
+ const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+ const __be32 *prop;
/* We are scanning "cpu" nodes only */
if (type == NULL || strcmp(type, "cpu") != 0)
return 0;
- prop = (u32 *)of_get_flat_dt_prop(node, "ibm,pft-size", NULL);
+ prop = of_get_flat_dt_prop(node, "ibm,pft-size", NULL);
if (prop != NULL) {
/* pft_size[0] is the NUMA CEC cookie */
- ppc64_pft_size = prop[1];
+ ppc64_pft_size = be32_to_cpu(prop[1]);
return 1;
}
return 0;
@@ -404,7 +587,7 @@ static int __init htab_dt_scan_pftsize(unsigned long node,
static unsigned long __init htab_get_table_size(void)
{
- unsigned long mem_size, rnd_mem_size, pteg_count;
+ unsigned long mem_size, rnd_mem_size, pteg_count, psize;
/* If hash size isn't already provided by the platform, we try to
* retrieve it from the device-tree. If it's not there neither, we
@@ -416,70 +599,80 @@ static unsigned long __init htab_get_table_size(void)
return 1UL << ppc64_pft_size;
/* round mem_size up to next power of 2 */
- mem_size = lmb_phys_mem_size();
+ mem_size = memblock_phys_mem_size();
rnd_mem_size = 1UL << __ilog2(mem_size);
if (rnd_mem_size < mem_size)
rnd_mem_size <<= 1;
/* # pages / 2 */
- pteg_count = max(rnd_mem_size >> (12 + 1), 1UL << 11);
+ psize = mmu_psize_defs[mmu_virtual_psize].shift;
+ pteg_count = max(rnd_mem_size >> (psize + 1), 1UL << 11);
return pteg_count << 7;
}
#ifdef CONFIG_MEMORY_HOTPLUG
-void create_section_mapping(unsigned long start, unsigned long end)
+int create_section_mapping(unsigned long start, unsigned long end)
{
- BUG_ON(htab_bolt_mapping(start, end, __pa(start),
- _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_COHERENT | PP_RWXX,
- mmu_linear_psize, mmu_kernel_ssize));
+ return htab_bolt_mapping(start, end, __pa(start),
+ pgprot_val(PAGE_KERNEL), mmu_linear_psize,
+ mmu_kernel_ssize);
}
-#endif /* CONFIG_MEMORY_HOTPLUG */
-static inline void make_bl(unsigned int *insn_addr, void *func)
+int remove_section_mapping(unsigned long start, unsigned long end)
{
- unsigned long funcp = *((unsigned long *)func);
- int offset = funcp - (unsigned long)insn_addr;
-
- *insn_addr = (unsigned int)(0x48000001 | (offset & 0x03fffffc));
- flush_icache_range((unsigned long)insn_addr, 4+
- (unsigned long)insn_addr);
+ return htab_remove_mapping(start, end, mmu_linear_psize,
+ mmu_kernel_ssize);
}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+extern u32 htab_call_hpte_insert1[];
+extern u32 htab_call_hpte_insert2[];
+extern u32 htab_call_hpte_remove[];
+extern u32 htab_call_hpte_updatepp[];
+extern u32 ht64_call_hpte_insert1[];
+extern u32 ht64_call_hpte_insert2[];
+extern u32 ht64_call_hpte_remove[];
+extern u32 ht64_call_hpte_updatepp[];
static void __init htab_finish_init(void)
{
- extern unsigned int *htab_call_hpte_insert1;
- extern unsigned int *htab_call_hpte_insert2;
- extern unsigned int *htab_call_hpte_remove;
- extern unsigned int *htab_call_hpte_updatepp;
-
#ifdef CONFIG_PPC_HAS_HASH_64K
- extern unsigned int *ht64_call_hpte_insert1;
- extern unsigned int *ht64_call_hpte_insert2;
- extern unsigned int *ht64_call_hpte_remove;
- extern unsigned int *ht64_call_hpte_updatepp;
-
- make_bl(ht64_call_hpte_insert1, ppc_md.hpte_insert);
- make_bl(ht64_call_hpte_insert2, ppc_md.hpte_insert);
- make_bl(ht64_call_hpte_remove, ppc_md.hpte_remove);
- make_bl(ht64_call_hpte_updatepp, ppc_md.hpte_updatepp);
+ patch_branch(ht64_call_hpte_insert1,
+ ppc_function_entry(ppc_md.hpte_insert),
+ BRANCH_SET_LINK);
+ patch_branch(ht64_call_hpte_insert2,
+ ppc_function_entry(ppc_md.hpte_insert),
+ BRANCH_SET_LINK);
+ patch_branch(ht64_call_hpte_remove,
+ ppc_function_entry(ppc_md.hpte_remove),
+ BRANCH_SET_LINK);
+ patch_branch(ht64_call_hpte_updatepp,
+ ppc_function_entry(ppc_md.hpte_updatepp),
+ BRANCH_SET_LINK);
#endif /* CONFIG_PPC_HAS_HASH_64K */
- make_bl(htab_call_hpte_insert1, ppc_md.hpte_insert);
- make_bl(htab_call_hpte_insert2, ppc_md.hpte_insert);
- make_bl(htab_call_hpte_remove, ppc_md.hpte_remove);
- make_bl(htab_call_hpte_updatepp, ppc_md.hpte_updatepp);
+ patch_branch(htab_call_hpte_insert1,
+ ppc_function_entry(ppc_md.hpte_insert),
+ BRANCH_SET_LINK);
+ patch_branch(htab_call_hpte_insert2,
+ ppc_function_entry(ppc_md.hpte_insert),
+ BRANCH_SET_LINK);
+ patch_branch(htab_call_hpte_remove,
+ ppc_function_entry(ppc_md.hpte_remove),
+ BRANCH_SET_LINK);
+ patch_branch(htab_call_hpte_updatepp,
+ ppc_function_entry(ppc_md.hpte_updatepp),
+ BRANCH_SET_LINK);
}
-void __init htab_initialize(void)
+static void __init htab_initialize(void)
{
unsigned long table;
unsigned long pteg_count;
- unsigned long mode_rw;
+ unsigned long prot;
unsigned long base = 0, size = 0, limit;
- int i;
-
- extern unsigned long tce_alloc_start, tce_alloc_end;
+ struct memblock_region *reg;
DBG(" -> htab_initialize()\n");
@@ -489,7 +682,7 @@ void __init htab_initialize(void)
/* Initialize page sizes */
htab_init_page_sizes();
- if (cpu_has_feature(CPU_FTR_1T_SEGMENT)) {
+ if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) {
mmu_kernel_ssize = MMU_SEGSIZE_1T;
mmu_highuser_ssize = MMU_SEGSIZE_1T;
printk(KERN_INFO "Using 1TB segments\n");
@@ -508,6 +701,16 @@ void __init htab_initialize(void)
/* Using a hypervisor which owns the htab */
htab_address = NULL;
_SDR1 = 0;
+#ifdef CONFIG_FA_DUMP
+ /*
+ * If firmware assisted dump is active firmware preserves
+ * the contents of htab along with entire partition memory.
+ * Clear the htab if firmware assisted dump is active so
+ * that we dont end up using old mappings.
+ */
+ if (is_fadump_active() && ppc_md.hpte_clear_all)
+ ppc_md.hpte_clear_all();
+#endif
} else {
/* Find storage for the HPT. Must be contiguous in
* the absolute address space. On cell we want it to be
@@ -516,14 +719,14 @@ void __init htab_initialize(void)
if (machine_is(cell))
limit = 0x80000000;
else
- limit = 0;
+ limit = MEMBLOCK_ALLOC_ANYWHERE;
- table = lmb_alloc_base(htab_size_bytes, htab_size_bytes, limit);
+ table = memblock_alloc_base(htab_size_bytes, htab_size_bytes, limit);
DBG("Hash table allocated at %lx, size: %lx\n", table,
htab_size_bytes);
- htab_address = abs_to_virt(table);
+ htab_address = __va(table);
/* htab absolute addr + encoded htabsize */
_SDR1 = table + __ilog2(pteg_count) - 11;
@@ -535,12 +738,12 @@ void __init htab_initialize(void)
mtspr(SPRN_SDR1, _SDR1);
}
- mode_rw = _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_COHERENT | PP_RWXX;
+ prot = pgprot_val(PAGE_KERNEL);
#ifdef CONFIG_DEBUG_PAGEALLOC
- linear_map_hash_count = lmb_end_of_DRAM() >> PAGE_SHIFT;
- linear_map_hash_slots = __va(lmb_alloc_base(linear_map_hash_count,
- 1, lmb.rmo_size));
+ linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT;
+ linear_map_hash_slots = __va(memblock_alloc_base(linear_map_hash_count,
+ 1, ppc64_rma_size));
memset(linear_map_hash_slots, 0, linear_map_hash_count);
#endif /* CONFIG_DEBUG_PAGEALLOC */
@@ -550,15 +753,16 @@ void __init htab_initialize(void)
*/
/* create bolted the linear mapping in the hash table */
- for (i=0; i < lmb.memory.cnt; i++) {
- base = (unsigned long)__va(lmb.memory.region[i].base);
- size = lmb.memory.region[i].size;
+ for_each_memblock(memory, reg) {
+ base = (unsigned long)__va(reg->base);
+ size = reg->size;
- DBG("creating mapping for region: %lx : %lx\n", base, size);
+ DBG("creating mapping for region: %lx..%lx (prot: %lx)\n",
+ base, size, prot);
#ifdef CONFIG_U3_DART
/* Do not map the DART space. Fortunately, it will be aligned
- * in such a way that it will not cross two lmb regions and
+ * in such a way that it will not cross two memblock regions and
* will fit within a single 16Mb page.
* The DART space is assumed to be a full 16Mb region even if
* we only use 2Mb of that space. We will use more of it later
@@ -571,22 +775,23 @@ void __init htab_initialize(void)
unsigned long dart_table_end = dart_tablebase + 16 * MB;
if (base != dart_tablebase)
BUG_ON(htab_bolt_mapping(base, dart_tablebase,
- __pa(base), mode_rw,
+ __pa(base), prot,
mmu_linear_psize,
mmu_kernel_ssize));
if ((base + size) > dart_table_end)
BUG_ON(htab_bolt_mapping(dart_tablebase+16*MB,
base + size,
__pa(dart_table_end),
- mode_rw,
+ prot,
mmu_linear_psize,
mmu_kernel_ssize));
continue;
}
#endif /* CONFIG_U3_DART */
BUG_ON(htab_bolt_mapping(base, base + size, __pa(base),
- mode_rw, mmu_linear_psize, mmu_kernel_ssize));
- }
+ prot, mmu_linear_psize, mmu_kernel_ssize));
+ }
+ memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
/*
* If we have a memory_limit and we've allocated TCEs then we need to
@@ -603,7 +808,7 @@ void __init htab_initialize(void)
tce_alloc_start = base + size + 1;
BUG_ON(htab_bolt_mapping(tce_alloc_start, tce_alloc_end,
- __pa(tce_alloc_start), mode_rw,
+ __pa(tce_alloc_start), prot,
mmu_linear_psize, mmu_kernel_ssize));
}
@@ -614,11 +819,41 @@ void __init htab_initialize(void)
#undef KB
#undef MB
-void htab_initialize_secondary(void)
+void __init early_init_mmu(void)
{
+ /* Setup initial STAB address in the PACA */
+ get_paca()->stab_real = __pa((u64)&initial_stab);
+ get_paca()->stab_addr = (u64)&initial_stab;
+
+ /* Initialize the MMU Hash table and create the linear mapping
+ * of memory. Has to be done before stab/slb initialization as
+ * this is currently where the page size encoding is obtained
+ */
+ htab_initialize();
+
+ /* Initialize stab / SLB management */
+ if (mmu_has_feature(MMU_FTR_SLB))
+ slb_initialize();
+ else
+ stab_initialize(get_paca()->stab_real);
+}
+
+#ifdef CONFIG_SMP
+void early_init_mmu_secondary(void)
+{
+ /* Initialize hash table for that CPU */
if (!firmware_has_feature(FW_FEATURE_LPAR))
mtspr(SPRN_SDR1, _SDR1);
+
+ /* Initialize STAB/SLB. We use a virtual address as it works
+ * in real mode on pSeries.
+ */
+ if (mmu_has_feature(MMU_FTR_SLB))
+ slb_initialize();
+ else
+ stab_initialize(get_paca()->stab_addr);
}
+#endif /* CONFIG_SMP */
/*
* Called by asm hashtable.S for doing lazy icache flush
@@ -635,7 +870,7 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
/* page is dirty */
if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
if (trap == 0x400) {
- __flush_dcache_icache(page_address(page));
+ flush_dcache_icache_page(page);
set_bit(PG_arch_1, &page->flags);
} else
pp |= HPTE_R_N;
@@ -643,6 +878,31 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
return pp;
}
+#ifdef CONFIG_PPC_MM_SLICES
+unsigned int get_paca_psize(unsigned long addr)
+{
+ u64 lpsizes;
+ unsigned char *hpsizes;
+ unsigned long index, mask_index;
+
+ if (addr < SLICE_LOW_TOP) {
+ lpsizes = get_paca()->context.low_slices_psize;
+ index = GET_LOW_SLICE_INDEX(addr);
+ return (lpsizes >> (index * 4)) & 0xF;
+ }
+ hpsizes = get_paca()->context.high_slices_psize;
+ index = GET_HIGH_SLICE_INDEX(addr);
+ mask_index = index & 0x1;
+ return (hpsizes[index >> 1] >> (mask_index * 4)) & 0xF;
+}
+
+#else
+unsigned int get_paca_psize(unsigned long addr)
+{
+ return get_paca()->context.user_psize;
+}
+#endif
+
/*
* Demote a segment to using 4k pages.
* For now this makes the whole process use 4k pages.
@@ -650,13 +910,13 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
#ifdef CONFIG_PPC_64K_PAGES
void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
{
- if (mm->context.user_psize == MMU_PAGE_4K)
+ if (get_slice_psize(mm, addr) == MMU_PAGE_4K)
return;
- slice_set_user_psize(mm, MMU_PAGE_4K);
+ slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K);
#ifdef CONFIG_SPU_BASE
spu_flush_all_slbs(mm);
#endif
- if (get_paca()->context.user_psize != MMU_PAGE_4K) {
+ if (get_paca_psize(addr) != MMU_PAGE_4K) {
get_paca()->context = mm->context;
slb_flush_and_rebolt();
}
@@ -671,15 +931,15 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
* Result is 0: full permissions, _PAGE_RW: read-only,
* _PAGE_USER or _PAGE_USER|_PAGE_RW: no access.
*/
-static int subpage_protection(pgd_t *pgdir, unsigned long ea)
+static int subpage_protection(struct mm_struct *mm, unsigned long ea)
{
- struct subpage_prot_table *spt = pgd_subpage_prot(pgdir);
+ struct subpage_prot_table *spt = &mm->context.spt;
u32 spp = 0;
u32 **sbpm, *sbpp;
if (ea >= spt->maxaddr)
return 0;
- if (ea < 0x100000000) {
+ if (ea < 0x100000000UL) {
/* addresses below 4GB use spt->low_prot */
sbpm = spt->low_prot;
} else {
@@ -701,12 +961,40 @@ static int subpage_protection(pgd_t *pgdir, unsigned long ea)
}
#else /* CONFIG_PPC_SUBPAGE_PROT */
-static inline int subpage_protection(pgd_t *pgdir, unsigned long ea)
+static inline int subpage_protection(struct mm_struct *mm, unsigned long ea)
{
return 0;
}
#endif
+void hash_failure_debug(unsigned long ea, unsigned long access,
+ unsigned long vsid, unsigned long trap,
+ int ssize, int psize, int lpsize, unsigned long pte)
+{
+ if (!printk_ratelimit())
+ return;
+ pr_info("mm: Hashing failure ! EA=0x%lx access=0x%lx current=%s\n",
+ ea, access, current->comm);
+ pr_info(" trap=0x%lx vsid=0x%lx ssize=%d base psize=%d psize %d pte=0x%lx\n",
+ trap, vsid, ssize, psize, lpsize, pte);
+}
+
+static void check_paca_psize(unsigned long ea, struct mm_struct *mm,
+ int psize, bool user_region)
+{
+ if (user_region) {
+ if (psize != get_paca_psize(ea)) {
+ get_paca()->context = mm->context;
+ slb_flush_and_rebolt();
+ }
+ } else if (get_paca()->vmalloc_sllp !=
+ mmu_psize_defs[mmu_vmalloc_psize].sllp) {
+ get_paca()->vmalloc_sllp =
+ mmu_psize_defs[mmu_vmalloc_psize].sllp;
+ slb_vmalloc_update();
+ }
+}
+
/* Result code is:
* 0 - handled
* 1 - normal page fault
@@ -715,22 +1003,19 @@ static inline int subpage_protection(pgd_t *pgdir, unsigned long ea)
*/
int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
{
- void *pgdir;
+ enum ctx_state prev_state = exception_enter();
+ pgd_t *pgdir;
unsigned long vsid;
struct mm_struct *mm;
pte_t *ptep;
- cpumask_t tmp;
+ unsigned hugeshift;
+ const struct cpumask *tmp;
int rc, user_region = 0, local = 0;
int psize, ssize;
DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n",
ea, access, trap);
- if ((ea & ~REGION_MASK) >= PGTABLE_RANGE) {
- DBG_LOW(" out of pgtable range !\n");
- return 1;
- }
-
/* Get region & vsid */
switch (REGION_ID(ea)) {
case USER_REGION_ID:
@@ -738,13 +1023,10 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
mm = current->mm;
if (! mm) {
DBG_LOW(" user region with no mm !\n");
- return 1;
+ rc = 1;
+ goto bail;
}
-#ifdef CONFIG_PPC_MM_SLICES
psize = get_slice_psize(mm, ea);
-#else
- psize = mm->context.user_psize;
-#endif
ssize = user_segment_size(ea);
vsid = get_vsid(mm->context.id, ea, ssize);
break;
@@ -761,62 +1043,94 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
/* Not a valid range
* Send the problem up to do_page_fault
*/
- return 1;
+ rc = 1;
+ goto bail;
}
DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid);
+ /* Bad address. */
+ if (!vsid) {
+ DBG_LOW("Bad address!\n");
+ rc = 1;
+ goto bail;
+ }
/* Get pgdir */
pgdir = mm->pgd;
- if (pgdir == NULL)
- return 1;
+ if (pgdir == NULL) {
+ rc = 1;
+ goto bail;
+ }
/* Check CPU locality */
- tmp = cpumask_of_cpu(smp_processor_id());
- if (user_region && cpus_equal(mm->cpu_vm_mask, tmp))
+ tmp = cpumask_of(smp_processor_id());
+ if (user_region && cpumask_equal(mm_cpumask(mm), tmp))
local = 1;
-#ifdef CONFIG_HUGETLB_PAGE
- /* Handle hugepage regions */
- if (HPAGE_SHIFT && psize == mmu_huge_psize) {
- DBG_LOW(" -> huge page !\n");
- return hash_huge_page(mm, access, ea, vsid, local, trap);
- }
-#endif /* CONFIG_HUGETLB_PAGE */
-
#ifndef CONFIG_PPC_64K_PAGES
- /* If we use 4K pages and our psize is not 4K, then we are hitting
- * a special driver mapping, we need to align the address before
- * we fetch the PTE
+ /* If we use 4K pages and our psize is not 4K, then we might
+ * be hitting a special driver mapping, and need to align the
+ * address before we fetch the PTE.
+ *
+ * It could also be a hugepage mapping, in which case this is
+ * not necessary, but it's not harmful, either.
*/
if (psize != MMU_PAGE_4K)
ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
#endif /* CONFIG_PPC_64K_PAGES */
/* Get PTE and page size from page tables */
- ptep = find_linux_pte(pgdir, ea);
+ ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugeshift);
if (ptep == NULL || !pte_present(*ptep)) {
DBG_LOW(" no PTE !\n");
- return 1;
+ rc = 1;
+ goto bail;
}
-#ifndef CONFIG_PPC_64K_PAGES
- DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
-#else
- DBG_LOW(" i-pte: %016lx %016lx\n", pte_val(*ptep),
- pte_val(*(ptep + PTRS_PER_PTE)));
-#endif
+ /* Add _PAGE_PRESENT to the required access perm */
+ access |= _PAGE_PRESENT;
+
/* Pre-check access permissions (will be re-checked atomically
* in __hash_page_XX but this pre-check is a fast path
*/
if (access & ~pte_val(*ptep)) {
DBG_LOW(" no access !\n");
- return 1;
+ rc = 1;
+ goto bail;
}
+ if (hugeshift) {
+ if (pmd_trans_huge(*(pmd_t *)ptep))
+ rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep,
+ trap, local, ssize, psize);
+#ifdef CONFIG_HUGETLB_PAGE
+ else
+ rc = __hash_page_huge(ea, access, vsid, ptep, trap,
+ local, ssize, hugeshift, psize);
+#else
+ else {
+ /*
+ * if we have hugeshift, and is not transhuge with
+ * hugetlb disabled, something is really wrong.
+ */
+ rc = 1;
+ WARN_ON(1);
+ }
+#endif
+ check_paca_psize(ea, mm, psize, user_region);
+
+ goto bail;
+ }
+
+#ifndef CONFIG_PPC_64K_PAGES
+ DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
+#else
+ DBG_LOW(" i-pte: %016lx %016lx\n", pte_val(*ptep),
+ pte_val(*(ptep + PTRS_PER_PTE)));
+#endif
/* Do actual hashing */
#ifdef CONFIG_PPC_64K_PAGES
/* If _PAGE_4K_PFN is set, make sure this is a 4k segment */
- if (pte_val(*ptep) & _PAGE_4K_PFN) {
+ if ((pte_val(*ptep) & _PAGE_4K_PFN) && psize == MMU_PAGE_64K) {
demote_segment_4k(mm, ea);
psize = MMU_PAGE_4K;
}
@@ -844,17 +1158,8 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
#endif
}
}
- if (user_region) {
- if (psize != get_paca()->context.user_psize) {
- get_paca()->context = mm->context;
- slb_flush_and_rebolt();
- }
- } else if (get_paca()->vmalloc_sllp !=
- mmu_psize_defs[mmu_vmalloc_psize].sllp) {
- get_paca()->vmalloc_sllp =
- mmu_psize_defs[mmu_vmalloc_psize].sllp;
- slb_vmalloc_update();
- }
+
+ check_paca_psize(ea, mm, psize, user_region);
#endif /* CONFIG_PPC_64K_PAGES */
#ifdef CONFIG_PPC_HAS_HASH_64K
@@ -863,7 +1168,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
else
#endif /* CONFIG_PPC_HAS_HASH_64K */
{
- int spp = subpage_protection(pgdir, ea);
+ int spp = subpage_protection(mm, ea);
if (access & spp)
rc = -2;
else
@@ -871,6 +1176,12 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
local, ssize, spp);
}
+ /* Dump some info in case of hash insertion failure, they should
+ * never happen so it is really useful to know if/when they do
+ */
+ if (rc == -1)
+ hash_failure_debug(ea, access, vsid, trap, ssize, psize,
+ psize, pte_val(*ptep));
#ifndef CONFIG_PPC_64K_PAGES
DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep));
#else
@@ -878,6 +1189,9 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
pte_val(*(ptep + PTRS_PER_PTE)));
#endif
DBG_LOW(" -> rc=%d\n", rc);
+
+bail:
+ exception_exit(prev_state);
return rc;
}
EXPORT_SYMBOL_GPL(hash_page);
@@ -885,13 +1199,12 @@ EXPORT_SYMBOL_GPL(hash_page);
void hash_preload(struct mm_struct *mm, unsigned long ea,
unsigned long access, unsigned long trap)
{
+ int hugepage_shift;
unsigned long vsid;
- void *pgdir;
+ pgd_t *pgdir;
pte_t *ptep;
- cpumask_t mask;
unsigned long flags;
- int local = 0;
- int ssize;
+ int rc, ssize, local = 0;
BUG_ON(REGION_ID(ea) != USER_REGION_ID);
@@ -908,10 +1221,27 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
pgdir = mm->pgd;
if (pgdir == NULL)
return;
- ptep = find_linux_pte(pgdir, ea);
- if (!ptep)
+
+ /* Get VSID */
+ ssize = user_segment_size(ea);
+ vsid = get_vsid(mm->context.id, ea, ssize);
+ if (!vsid)
return;
+ /*
+ * Hash doesn't like irqs. Walking linux page table with irq disabled
+ * saves us from holding multiple locks.
+ */
+ local_irq_save(flags);
+ /*
+ * THP pages use update_mmu_cache_pmd. We don't do
+ * hash preload there. Hence can ignore THP here
+ */
+ ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugepage_shift);
+ if (!ptep)
+ goto out_exit;
+
+ WARN_ON(hugepage_shift);
#ifdef CONFIG_PPC_64K_PAGES
/* If either _PAGE_4K_PFN or _PAGE_NO_CACHE is set (and we are on
* a 64K kernel), then we don't preload, hash_page() will take
@@ -920,52 +1250,73 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
* page size demotion here
*/
if (pte_val(*ptep) & (_PAGE_4K_PFN | _PAGE_NO_CACHE))
- return;
+ goto out_exit;
#endif /* CONFIG_PPC_64K_PAGES */
- /* Get VSID */
- ssize = user_segment_size(ea);
- vsid = get_vsid(mm->context.id, ea, ssize);
-
- /* Hash doesn't like irqs */
- local_irq_save(flags);
-
/* Is that local to this CPU ? */
- mask = cpumask_of_cpu(smp_processor_id());
- if (cpus_equal(mm->cpu_vm_mask, mask))
+ if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
local = 1;
/* Hash it in */
#ifdef CONFIG_PPC_HAS_HASH_64K
if (mm->context.user_psize == MMU_PAGE_64K)
- __hash_page_64K(ea, access, vsid, ptep, trap, local, ssize);
+ rc = __hash_page_64K(ea, access, vsid, ptep, trap, local, ssize);
else
#endif /* CONFIG_PPC_HAS_HASH_64K */
- __hash_page_4K(ea, access, vsid, ptep, trap, local, ssize,
- subpage_protection(pgdir, ea));
+ rc = __hash_page_4K(ea, access, vsid, ptep, trap, local, ssize,
+ subpage_protection(mm, ea));
+ /* Dump some info in case of hash insertion failure, they should
+ * never happen so it is really useful to know if/when they do
+ */
+ if (rc == -1)
+ hash_failure_debug(ea, access, vsid, trap, ssize,
+ mm->context.user_psize,
+ mm->context.user_psize,
+ pte_val(*ptep));
+out_exit:
local_irq_restore(flags);
}
/* WARNING: This is called from hash_low_64.S, if you change this prototype,
* do not forget to update the assembly call site !
*/
-void flush_hash_page(unsigned long va, real_pte_t pte, int psize, int ssize,
+void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize,
int local)
{
unsigned long hash, index, shift, hidx, slot;
- DBG_LOW("flush_hash_page(va=%016x)\n", va);
- pte_iterate_hashed_subpages(pte, psize, va, index, shift) {
- hash = hpt_hash(va, shift, ssize);
+ DBG_LOW("flush_hash_page(vpn=%016lx)\n", vpn);
+ pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
+ hash = hpt_hash(vpn, shift, ssize);
hidx = __rpte_to_hidx(pte, index);
if (hidx & _PTEIDX_SECONDARY)
hash = ~hash;
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
slot += hidx & _PTEIDX_GROUP_IX;
- DBG_LOW(" sub %d: hash=%x, hidx=%x\n", index, slot, hidx);
- ppc_md.hpte_invalidate(slot, va, psize, ssize, local);
+ DBG_LOW(" sub %ld: hash=%lx, hidx=%lx\n", index, slot, hidx);
+ /*
+ * We use same base page size and actual psize, because we don't
+ * use these functions for hugepage
+ */
+ ppc_md.hpte_invalidate(slot, vpn, psize, psize, ssize, local);
} pte_iterate_hashed_end();
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ /* Transactions are not aborted by tlbiel, only tlbie.
+ * Without, syncing a page back to a block device w/ PIO could pick up
+ * transactional data (bad!) so we force an abort here. Before the
+ * sync the page will be made read-only, which will flush_hash_page.
+ * BIG ISSUE here: if the kernel uses a page from userspace without
+ * unmapping it first, it may see the speculated version.
+ */
+ if (local && cpu_has_feature(CPU_FTR_TM) &&
+ current->thread.regs &&
+ MSR_TM_ACTIVE(current->thread.regs->msr)) {
+ tm_enable();
+ tm_abort(TM_CAUSE_TLBI);
+ }
+#endif
}
void flush_hash_range(unsigned long number, int local)
@@ -978,7 +1329,7 @@ void flush_hash_range(unsigned long number, int local)
&__get_cpu_var(ppc64_tlb_batch);
for (i = 0; i < number; i++)
- flush_hash_page(batch->vaddr[i], batch->pte[i],
+ flush_hash_page(batch->vpn[i], batch->pte[i],
batch->psize, batch->ssize, local);
}
}
@@ -989,6 +1340,8 @@ void flush_hash_range(unsigned long number, int local)
*/
void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc)
{
+ enum ctx_state prev_state = exception_enter();
+
if (user_mode(regs)) {
#ifdef CONFIG_PPC_SUBPAGE_PROT
if (rc == -2)
@@ -998,24 +1351,64 @@ void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc)
_exception(SIGBUS, regs, BUS_ADRERR, address);
} else
bad_page_fault(regs, address, SIGBUS);
+
+ exception_exit(prev_state);
+}
+
+long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
+ unsigned long pa, unsigned long rflags,
+ unsigned long vflags, int psize, int ssize)
+{
+ unsigned long hpte_group;
+ long slot;
+
+repeat:
+ hpte_group = ((hash & htab_hash_mask) *
+ HPTES_PER_GROUP) & ~0x7UL;
+
+ /* Insert into the hash table, primary slot */
+ slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, vflags,
+ psize, psize, ssize);
+
+ /* Primary is full, try the secondary */
+ if (unlikely(slot == -1)) {
+ hpte_group = ((~hash & htab_hash_mask) *
+ HPTES_PER_GROUP) & ~0x7UL;
+ slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags,
+ vflags | HPTE_V_SECONDARY,
+ psize, psize, ssize);
+ if (slot == -1) {
+ if (mftb() & 0x1)
+ hpte_group = ((hash & htab_hash_mask) *
+ HPTES_PER_GROUP)&~0x7UL;
+
+ ppc_md.hpte_remove(hpte_group);
+ goto repeat;
+ }
+ }
+
+ return slot;
}
#ifdef CONFIG_DEBUG_PAGEALLOC
static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
{
- unsigned long hash, hpteg;
+ unsigned long hash;
unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
- unsigned long va = hpt_va(vaddr, vsid, mmu_kernel_ssize);
- unsigned long mode = _PAGE_ACCESSED | _PAGE_DIRTY |
- _PAGE_COHERENT | PP_RWXX | HPTE_R_N;
- int ret;
+ unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
+ unsigned long mode = htab_convert_pte_flags(PAGE_KERNEL);
+ long ret;
+
+ hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
+
+ /* Don't create HPTE entries for bad address */
+ if (!vsid)
+ return;
- hash = hpt_hash(va, PAGE_SHIFT, mmu_kernel_ssize);
- hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
+ ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode,
+ HPTE_V_BOLTED,
+ mmu_linear_psize, mmu_kernel_ssize);
- ret = ppc_md.hpte_insert(hpteg, va, __pa(vaddr),
- mode, HPTE_V_BOLTED,
- mmu_linear_psize, mmu_kernel_ssize);
BUG_ON (ret < 0);
spin_lock(&linear_map_hash_lock);
BUG_ON(linear_map_hash_slots[lmi] & 0x80);
@@ -1027,9 +1420,9 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
{
unsigned long hash, hidx, slot;
unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
- unsigned long va = hpt_va(vaddr, vsid, mmu_kernel_ssize);
+ unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
- hash = hpt_hash(va, PAGE_SHIFT, mmu_kernel_ssize);
+ hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
spin_lock(&linear_map_hash_lock);
BUG_ON(!(linear_map_hash_slots[lmi] & 0x80));
hidx = linear_map_hash_slots[lmi] & 0x7f;
@@ -1039,7 +1432,8 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
hash = ~hash;
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
slot += hidx & _PTEIDX_GROUP_IX;
- ppc_md.hpte_invalidate(slot, va, mmu_linear_psize, mmu_kernel_ssize, 0);
+ ppc_md.hpte_invalidate(slot, vpn, mmu_linear_psize, mmu_linear_psize,
+ mmu_kernel_ssize, 0);
}
void kernel_map_pages(struct page *page, int numpages, int enable)
@@ -1061,3 +1455,23 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
local_irq_restore(flags);
}
#endif /* CONFIG_DEBUG_PAGEALLOC */
+
+void setup_initial_memory_limit(phys_addr_t first_memblock_base,
+ phys_addr_t first_memblock_size)
+{
+ /* We don't currently support the first MEMBLOCK not mapping 0
+ * physical on those processors
+ */
+ BUG_ON(first_memblock_base != 0);
+
+ /* On LPAR systems, the first entry is our RMA region,
+ * non-LPAR 64-bit hash MMU systems don't have a limitation
+ * on real mode access, but using the first entry works well
+ * enough. We also clamp it to 1G to avoid some funky things
+ * such as RTAS bugs etc...
+ */
+ ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000);
+
+ /* Finally limit subsequent allocations */
+ memblock_set_current_limit(ppc64_rma_size);
+}
diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c
new file mode 100644
index 00000000000..e7450bdbe83
--- /dev/null
+++ b/arch/powerpc/mm/highmem.c
@@ -0,0 +1,86 @@
+/*
+ * highmem.c: virtual kernel memory mappings for high memory
+ *
+ * PowerPC version, stolen from the i386 version.
+ *
+ * Used in CONFIG_HIGHMEM systems for memory pages which
+ * are not addressable by direct kernel virtual addresses.
+ *
+ * Copyright (C) 1999 Gerhard Wichert, Siemens AG
+ * Gerhard.Wichert@pdb.siemens.de
+ *
+ *
+ * Redesigned the x86 32-bit VM architecture to deal with
+ * up to 16 Terrabyte physical memory. With current x86 CPUs
+ * we now support up to 64 Gigabytes physical RAM.
+ *
+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ *
+ * Reworked for PowerPC by various contributors. Moved from
+ * highmem.h by Benjamin Herrenschmidt (c) 2009 IBM Corp.
+ */
+
+#include <linux/highmem.h>
+#include <linux/module.h>
+
+/*
+ * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap
+ * gives a more generic (and caching) interface. But kmap_atomic can
+ * be used in IRQ contexts, so in some (very limited) cases we need
+ * it.
+ */
+void *kmap_atomic_prot(struct page *page, pgprot_t prot)
+{
+ unsigned long vaddr;
+ int idx, type;
+
+ /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+ pagefault_disable();
+ if (!PageHighMem(page))
+ return page_address(page);
+
+ type = kmap_atomic_idx_push();
+ idx = type + KM_TYPE_NR*smp_processor_id();
+ vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+#ifdef CONFIG_DEBUG_HIGHMEM
+ BUG_ON(!pte_none(*(kmap_pte-idx)));
+#endif
+ __set_pte_at(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot), 1);
+ local_flush_tlb_page(NULL, vaddr);
+
+ return (void*) vaddr;
+}
+EXPORT_SYMBOL(kmap_atomic_prot);
+
+void __kunmap_atomic(void *kvaddr)
+{
+ unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
+ int type;
+
+ if (vaddr < __fix_to_virt(FIX_KMAP_END)) {
+ pagefault_enable();
+ return;
+ }
+
+ type = kmap_atomic_idx();
+
+#ifdef CONFIG_DEBUG_HIGHMEM
+ {
+ unsigned int idx;
+
+ idx = type + KM_TYPE_NR * smp_processor_id();
+ BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
+
+ /*
+ * force other mappings to Oops if they'll try to access
+ * this pte without first remap it
+ */
+ pte_clear(&init_mm, vaddr, kmap_pte-idx);
+ local_flush_tlb_page(NULL, vaddr);
+ }
+#endif
+
+ kmap_atomic_idx_pop();
+ pagefault_enable();
+}
+EXPORT_SYMBOL(__kunmap_atomic);
diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c
new file mode 100644
index 00000000000..826893fcb3a
--- /dev/null
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -0,0 +1,179 @@
+/*
+ * Copyright IBM Corporation, 2013
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+/*
+ * PPC64 THP Support for hash based MMUs
+ */
+#include <linux/mm.h>
+#include <asm/machdep.h>
+
+int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
+ pmd_t *pmdp, unsigned long trap, int local, int ssize,
+ unsigned int psize)
+{
+ unsigned int index, valid;
+ unsigned char *hpte_slot_array;
+ unsigned long rflags, pa, hidx;
+ unsigned long old_pmd, new_pmd;
+ int ret, lpsize = MMU_PAGE_16M;
+ unsigned long vpn, hash, shift, slot;
+
+ /*
+ * atomically mark the linux large page PMD busy and dirty
+ */
+ do {
+ old_pmd = pmd_val(*pmdp);
+ /* If PMD busy, retry the access */
+ if (unlikely(old_pmd & _PAGE_BUSY))
+ return 0;
+ /* If PMD is trans splitting retry the access */
+ if (unlikely(old_pmd & _PAGE_SPLITTING))
+ return 0;
+ /* If PMD permissions don't match, take page fault */
+ if (unlikely(access & ~old_pmd))
+ return 1;
+ /*
+ * Try to lock the PTE, add ACCESSED and DIRTY if it was
+ * a write access
+ */
+ new_pmd = old_pmd | _PAGE_BUSY | _PAGE_ACCESSED;
+ if (access & _PAGE_RW)
+ new_pmd |= _PAGE_DIRTY;
+ } while (old_pmd != __cmpxchg_u64((unsigned long *)pmdp,
+ old_pmd, new_pmd));
+ /*
+ * PP bits. _PAGE_USER is already PP bit 0x2, so we only
+ * need to add in 0x1 if it's a read-only user page
+ */
+ rflags = new_pmd & _PAGE_USER;
+ if ((new_pmd & _PAGE_USER) && !((new_pmd & _PAGE_RW) &&
+ (new_pmd & _PAGE_DIRTY)))
+ rflags |= 0x1;
+ /*
+ * _PAGE_EXEC -> HW_NO_EXEC since it's inverted
+ */
+ rflags |= ((new_pmd & _PAGE_EXEC) ? 0 : HPTE_R_N);
+
+#if 0
+ if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
+
+ /*
+ * No CPU has hugepages but lacks no execute, so we
+ * don't need to worry about that case
+ */
+ rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
+ }
+#endif
+ /*
+ * Find the slot index details for this ea, using base page size.
+ */
+ shift = mmu_psize_defs[psize].shift;
+ index = (ea & ~HPAGE_PMD_MASK) >> shift;
+ BUG_ON(index >= 4096);
+
+ vpn = hpt_vpn(ea, vsid, ssize);
+ hash = hpt_hash(vpn, shift, ssize);
+ hpte_slot_array = get_hpte_slot_array(pmdp);
+
+ valid = hpte_valid(hpte_slot_array, index);
+ if (valid) {
+ /* update the hpte bits */
+ hidx = hpte_hash_index(hpte_slot_array, index);
+ if (hidx & _PTEIDX_SECONDARY)
+ hash = ~hash;
+ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot += hidx & _PTEIDX_GROUP_IX;
+
+ ret = ppc_md.hpte_updatepp(slot, rflags, vpn,
+ psize, lpsize, ssize, local);
+ /*
+ * We failed to update, try to insert a new entry.
+ */
+ if (ret == -1) {
+ /*
+ * large pte is marked busy, so we can be sure
+ * nobody is looking at hpte_slot_array. hence we can
+ * safely update this here.
+ */
+ valid = 0;
+ new_pmd &= ~_PAGE_HPTEFLAGS;
+ hpte_slot_array[index] = 0;
+ } else
+ /* clear the busy bits and set the hash pte bits */
+ new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
+ }
+
+ if (!valid) {
+ unsigned long hpte_group;
+
+ /* insert new entry */
+ pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT;
+repeat:
+ hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
+
+ /* clear the busy bits and set the hash pte bits */
+ new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
+
+ /* Add in WIMG bits */
+ rflags |= (new_pmd & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
+ _PAGE_GUARDED));
+ /*
+ * enable the memory coherence always
+ */
+ rflags |= HPTE_R_M;
+
+ /* Insert into the hash table, primary slot */
+ slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0,
+ psize, lpsize, ssize);
+ /*
+ * Primary is full, try the secondary
+ */
+ if (unlikely(slot == -1)) {
+ hpte_group = ((~hash & htab_hash_mask) *
+ HPTES_PER_GROUP) & ~0x7UL;
+ slot = ppc_md.hpte_insert(hpte_group, vpn, pa,
+ rflags, HPTE_V_SECONDARY,
+ psize, lpsize, ssize);
+ if (slot == -1) {
+ if (mftb() & 0x1)
+ hpte_group = ((hash & htab_hash_mask) *
+ HPTES_PER_GROUP) & ~0x7UL;
+
+ ppc_md.hpte_remove(hpte_group);
+ goto repeat;
+ }
+ }
+ /*
+ * Hypervisor failure. Restore old pmd and return -1
+ * similar to __hash_page_*
+ */
+ if (unlikely(slot == -2)) {
+ *pmdp = __pmd(old_pmd);
+ hash_failure_debug(ea, access, vsid, trap, ssize,
+ psize, lpsize, old_pmd);
+ return -1;
+ }
+ /*
+ * large pte is marked busy, so we can be sure
+ * nobody is looking at hpte_slot_array. hence we can
+ * safely update this here.
+ */
+ mark_hpte_slot_valid(hpte_slot_array, index, slot);
+ }
+ /*
+ * No need to use ldarx/stdcx here
+ */
+ *pmdp = __pmd(new_pmd & ~_PAGE_BUSY);
+ return 0;
+}
diff --git a/arch/powerpc/mm/hugetlbpage-book3e.c b/arch/powerpc/mm/hugetlbpage-book3e.c
new file mode 100644
index 00000000000..5e4ee257390
--- /dev/null
+++ b/arch/powerpc/mm/hugetlbpage-book3e.c
@@ -0,0 +1,153 @@
+/*
+ * PPC Huge TLB Page Support for Book3E MMU
+ *
+ * Copyright (C) 2009 David Gibson, IBM Corporation.
+ * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
+ *
+ */
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+#ifdef CONFIG_PPC64
+static inline int tlb1_next(void)
+{
+ struct paca_struct *paca = get_paca();
+ struct tlb_core_data *tcd;
+ int this, next;
+
+ tcd = paca->tcd_ptr;
+ this = tcd->esel_next;
+
+ next = this + 1;
+ if (next >= tcd->esel_max)
+ next = tcd->esel_first;
+
+ tcd->esel_next = next;
+ return this;
+}
+#else
+static inline int tlb1_next(void)
+{
+ int index, ncams;
+
+ ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY;
+
+ index = __get_cpu_var(next_tlbcam_idx);
+
+ /* Just round-robin the entries and wrap when we hit the end */
+ if (unlikely(index == ncams - 1))
+ __get_cpu_var(next_tlbcam_idx) = tlbcam_index;
+ else
+ __get_cpu_var(next_tlbcam_idx)++;
+
+ return index;
+}
+#endif /* !PPC64 */
+#endif /* FSL */
+
+static inline int mmu_get_tsize(int psize)
+{
+ return mmu_psize_defs[psize].enc;
+}
+
+static inline int book3e_tlb_exists(unsigned long ea, unsigned long pid)
+{
+ int found = 0;
+
+ mtspr(SPRN_MAS6, pid << 16);
+ if (mmu_has_feature(MMU_FTR_USE_TLBRSRV)) {
+ asm volatile(
+ "li %0,0\n"
+ "tlbsx. 0,%1\n"
+ "bne 1f\n"
+ "li %0,1\n"
+ "1:\n"
+ : "=&r"(found) : "r"(ea));
+ } else {
+ asm volatile(
+ "tlbsx 0,%1\n"
+ "mfspr %0,0x271\n"
+ "srwi %0,%0,31\n"
+ : "=&r"(found) : "r"(ea));
+ }
+
+ return found;
+}
+
+void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea,
+ pte_t pte)
+{
+ unsigned long mas1, mas2;
+ u64 mas7_3;
+ unsigned long psize, tsize, shift;
+ unsigned long flags;
+ struct mm_struct *mm;
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+ int index;
+#endif
+
+ if (unlikely(is_kernel_addr(ea)))
+ return;
+
+ mm = vma->vm_mm;
+
+#ifdef CONFIG_PPC_MM_SLICES
+ psize = get_slice_psize(mm, ea);
+ tsize = mmu_get_tsize(psize);
+ shift = mmu_psize_defs[psize].shift;
+#else
+ psize = vma_mmu_pagesize(vma);
+ shift = __ilog2(psize);
+ tsize = shift - 10;
+#endif
+
+ /*
+ * We can't be interrupted while we're setting up the MAS
+ * regusters or after we've confirmed that no tlb exists.
+ */
+ local_irq_save(flags);
+
+ if (unlikely(book3e_tlb_exists(ea, mm->context.id))) {
+ local_irq_restore(flags);
+ return;
+ }
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+ /* We have to use the CAM(TLB1) on FSL parts for hugepages */
+ index = tlb1_next();
+ mtspr(SPRN_MAS0, MAS0_ESEL(index) | MAS0_TLBSEL(1));
+#endif
+
+ mas1 = MAS1_VALID | MAS1_TID(mm->context.id) | MAS1_TSIZE(tsize);
+ mas2 = ea & ~((1UL << shift) - 1);
+ mas2 |= (pte_val(pte) >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK;
+ mas7_3 = (u64)pte_pfn(pte) << PAGE_SHIFT;
+ mas7_3 |= (pte_val(pte) >> PTE_BAP_SHIFT) & MAS3_BAP_MASK;
+ if (!pte_dirty(pte))
+ mas7_3 &= ~(MAS3_SW|MAS3_UW);
+
+ mtspr(SPRN_MAS1, mas1);
+ mtspr(SPRN_MAS2, mas2);
+
+ if (mmu_has_feature(MMU_FTR_USE_PAIRED_MAS)) {
+ mtspr(SPRN_MAS7_MAS3, mas7_3);
+ } else {
+ if (mmu_has_feature(MMU_FTR_BIG_PHYS))
+ mtspr(SPRN_MAS7, upper_32_bits(mas7_3));
+ mtspr(SPRN_MAS3, lower_32_bits(mas7_3));
+ }
+
+ asm volatile ("tlbwe");
+
+ local_irq_restore(flags);
+}
+
+void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+ struct hstate *hstate = hstate_file(vma->vm_file);
+ unsigned long tsize = huge_page_shift(hstate) - 10;
+
+ __flush_tlb_page(vma->vm_mm, vmaddr, tsize, 0);
+}
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
new file mode 100644
index 00000000000..a5bcf930119
--- /dev/null
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -0,0 +1,129 @@
+/*
+ * PPC64 Huge TLB Page Support for hash based MMUs (POWER4 and later)
+ *
+ * Copyright (C) 2003 David Gibson, IBM Corporation.
+ *
+ * Based on the IA-32 version:
+ * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/cacheflush.h>
+#include <asm/machdep.h>
+
+extern long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
+ unsigned long pa, unsigned long rlags,
+ unsigned long vflags, int psize, int ssize);
+
+int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
+ pte_t *ptep, unsigned long trap, int local, int ssize,
+ unsigned int shift, unsigned int mmu_psize)
+{
+ unsigned long vpn;
+ unsigned long old_pte, new_pte;
+ unsigned long rflags, pa, sz;
+ long slot;
+
+ BUG_ON(shift != mmu_psize_defs[mmu_psize].shift);
+
+ /* Search the Linux page table for a match with va */
+ vpn = hpt_vpn(ea, vsid, ssize);
+
+ /* At this point, we have a pte (old_pte) which can be used to build
+ * or update an HPTE. There are 2 cases:
+ *
+ * 1. There is a valid (present) pte with no associated HPTE (this is
+ * the most common case)
+ * 2. There is a valid (present) pte with an associated HPTE. The
+ * current values of the pp bits in the HPTE prevent access
+ * because we are doing software DIRTY bit management and the
+ * page is currently not DIRTY.
+ */
+
+
+ do {
+ old_pte = pte_val(*ptep);
+ /* If PTE busy, retry the access */
+ if (unlikely(old_pte & _PAGE_BUSY))
+ return 0;
+ /* If PTE permissions don't match, take page fault */
+ if (unlikely(access & ~old_pte))
+ return 1;
+ /* Try to lock the PTE, add ACCESSED and DIRTY if it was
+ * a write access */
+ new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
+ if (access & _PAGE_RW)
+ new_pte |= _PAGE_DIRTY;
+ } while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
+ old_pte, new_pte));
+
+ rflags = 0x2 | (!(new_pte & _PAGE_RW));
+ /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
+ rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
+ sz = ((1UL) << shift);
+ if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+ /* No CPU has hugepages but lacks no execute, so we
+ * don't need to worry about that case */
+ rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
+
+ /* Check if pte already has an hpte (case 2) */
+ if (unlikely(old_pte & _PAGE_HASHPTE)) {
+ /* There MIGHT be an HPTE for this pte */
+ unsigned long hash, slot;
+
+ hash = hpt_hash(vpn, shift, ssize);
+ if (old_pte & _PAGE_F_SECOND)
+ hash = ~hash;
+ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot += (old_pte & _PAGE_F_GIX) >> 12;
+
+ if (ppc_md.hpte_updatepp(slot, rflags, vpn, mmu_psize,
+ mmu_psize, ssize, local) == -1)
+ old_pte &= ~_PAGE_HPTEFLAGS;
+ }
+
+ if (likely(!(old_pte & _PAGE_HASHPTE))) {
+ unsigned long hash = hpt_hash(vpn, shift, ssize);
+
+ pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
+
+ /* clear HPTE slot informations in new PTE */
+#ifdef CONFIG_PPC_64K_PAGES
+ new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;
+#else
+ new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
+#endif
+ /* Add in WIMG bits */
+ rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
+ _PAGE_COHERENT | _PAGE_GUARDED));
+ /*
+ * enable the memory coherence always
+ */
+ rflags |= HPTE_R_M;
+
+ slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0,
+ mmu_psize, ssize);
+
+ /*
+ * Hypervisor failure. Restore old pte and return -1
+ * similar to __hash_page_*
+ */
+ if (unlikely(slot == -2)) {
+ *ptep = __pte(old_pte);
+ hash_failure_debug(ea, access, vsid, trap, ssize,
+ mmu_psize, mmu_psize, old_pte);
+ return -1;
+ }
+
+ new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
+ }
+
+ /*
+ * No need to use ldarx/stdcx here
+ */
+ *ptep = __pte(new_pte & ~_PAGE_BUSY);
+ return 0;
+}
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index a02266dad21..7e70ae968e5 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -1,175 +1,529 @@
/*
- * PPC64 (POWER4) Huge TLB Page Support for Kernel.
+ * PPC Huge TLB Page Support for Kernel.
*
* Copyright (C) 2003 David Gibson, IBM Corporation.
+ * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
*
* Based on the IA-32 version:
* Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
*/
-#include <linux/init.h>
-#include <linux/fs.h>
#include <linux/mm.h>
-#include <linux/hugetlb.h>
-#include <linux/pagemap.h>
+#include <linux/io.h>
#include <linux/slab.h>
-#include <linux/err.h>
-#include <linux/sysctl.h>
-#include <asm/mman.h>
+#include <linux/hugetlb.h>
+#include <linux/export.h>
+#include <linux/of_fdt.h>
+#include <linux/memblock.h>
+#include <linux/bootmem.h>
+#include <linux/moduleparam.h>
+#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
-#include <asm/tlbflush.h>
-#include <asm/mmu_context.h>
-#include <asm/machdep.h>
-#include <asm/cputable.h>
-#include <asm/spu.h>
+#include <asm/setup.h>
+#include <asm/hugetlb.h>
-#define HPAGE_SHIFT_64K 16
-#define HPAGE_SHIFT_16M 24
+#ifdef CONFIG_HUGETLB_PAGE
-#define NUM_LOW_AREAS (0x100000000UL >> SID_SHIFT)
-#define NUM_HIGH_AREAS (PGTABLE_RANGE >> HTLB_AREA_SHIFT)
+#define PAGE_SHIFT_64K 16
+#define PAGE_SHIFT_16M 24
+#define PAGE_SHIFT_16G 34
-unsigned int hugepte_shift;
-#define PTRS_PER_HUGEPTE (1 << hugepte_shift)
-#define HUGEPTE_TABLE_SIZE (sizeof(pte_t) << hugepte_shift)
+unsigned int HPAGE_SHIFT;
-#define HUGEPD_SHIFT (HPAGE_SHIFT + hugepte_shift)
-#define HUGEPD_SIZE (1UL << HUGEPD_SHIFT)
-#define HUGEPD_MASK (~(HUGEPD_SIZE-1))
+/*
+ * Tracks gpages after the device tree is scanned and before the
+ * huge_boot_pages list is ready. On non-Freescale implementations, this is
+ * just used to track 16G pages and so is a single array. FSL-based
+ * implementations may have more than one gpage size, so we need multiple
+ * arrays
+ */
+#ifdef CONFIG_PPC_FSL_BOOK3E
+#define MAX_NUMBER_GPAGES 128
+struct psize_gpages {
+ u64 gpage_list[MAX_NUMBER_GPAGES];
+ unsigned int nr_gpages;
+};
+static struct psize_gpages gpage_freearray[MMU_PAGE_COUNT];
+#else
+#define MAX_NUMBER_GPAGES 1024
+static u64 gpage_freearray[MAX_NUMBER_GPAGES];
+static unsigned nr_gpages;
+#endif
-#define huge_pgtable_cache (pgtable_cache[HUGEPTE_CACHE_NUM])
+#define hugepd_none(hpd) ((hpd).pd == 0)
-/* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad()
- * will choke on pointers to hugepte tables, which is handy for
- * catching screwups early. */
-#define HUGEPD_OK 0x1
+#ifdef CONFIG_PPC_BOOK3S_64
+/*
+ * At this point we do the placement change only for BOOK3S 64. This would
+ * possibly work on other subarchs.
+ */
-typedef struct { unsigned long pd; } hugepd_t;
+/*
+ * We have PGD_INDEX_SIZ = 12 and PTE_INDEX_SIZE = 8, so that we can have
+ * 16GB hugepage pte in PGD and 16MB hugepage pte at PMD;
+ */
+int pmd_huge(pmd_t pmd)
+{
+ /*
+ * leaf pte for huge page, bottom two bits != 00
+ */
+ return ((pmd_val(pmd) & 0x3) != 0x0);
+}
-#define hugepd_none(hpd) ((hpd).pd == 0)
+int pud_huge(pud_t pud)
+{
+ /*
+ * leaf pte for huge page, bottom two bits != 00
+ */
+ return ((pud_val(pud) & 0x3) != 0x0);
+}
+
+int pgd_huge(pgd_t pgd)
+{
+ /*
+ * leaf pte for huge page, bottom two bits != 00
+ */
+ return ((pgd_val(pgd) & 0x3) != 0x0);
+}
+#else
+int pmd_huge(pmd_t pmd)
+{
+ return 0;
+}
-static inline pte_t *hugepd_page(hugepd_t hpd)
+int pud_huge(pud_t pud)
{
- BUG_ON(!(hpd.pd & HUGEPD_OK));
- return (pte_t *)(hpd.pd & ~HUGEPD_OK);
+ return 0;
}
-static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr)
+int pgd_huge(pgd_t pgd)
{
- unsigned long idx = ((addr >> HPAGE_SHIFT) & (PTRS_PER_HUGEPTE-1));
- pte_t *dir = hugepd_page(*hpdp);
+ return 0;
+}
+#endif
- return dir + idx;
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+{
+ /* Only called for hugetlbfs pages, hence can ignore THP */
+ return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
}
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
- unsigned long address)
+ unsigned long address, unsigned pdshift, unsigned pshift)
{
- pte_t *new = kmem_cache_alloc(huge_pgtable_cache,
- GFP_KERNEL|__GFP_REPEAT);
+ struct kmem_cache *cachep;
+ pte_t *new;
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+ int i;
+ int num_hugepd = 1 << (pshift - pdshift);
+ cachep = hugepte_cache;
+#else
+ cachep = PGT_CACHE(pdshift - pshift);
+#endif
+
+ new = kmem_cache_zalloc(cachep, GFP_KERNEL|__GFP_REPEAT);
+
+ BUG_ON(pshift > HUGEPD_SHIFT_MASK);
+ BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
if (! new)
return -ENOMEM;
spin_lock(&mm->page_table_lock);
+#ifdef CONFIG_PPC_FSL_BOOK3E
+ /*
+ * We have multiple higher-level entries that point to the same
+ * actual pte location. Fill in each as we go and backtrack on error.
+ * We need all of these so the DTLB pgtable walk code can find the
+ * right higher-level entry without knowing if it's a hugepage or not.
+ */
+ for (i = 0; i < num_hugepd; i++, hpdp++) {
+ if (unlikely(!hugepd_none(*hpdp)))
+ break;
+ else
+ /* We use the old format for PPC_FSL_BOOK3E */
+ hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
+ }
+ /* If we bailed from the for loop early, an error occurred, clean up */
+ if (i < num_hugepd) {
+ for (i = i - 1 ; i >= 0; i--, hpdp--)
+ hpdp->pd = 0;
+ kmem_cache_free(cachep, new);
+ }
+#else
if (!hugepd_none(*hpdp))
- kmem_cache_free(huge_pgtable_cache, new);
- else
- hpdp->pd = (unsigned long)new | HUGEPD_OK;
+ kmem_cache_free(cachep, new);
+ else {
+#ifdef CONFIG_PPC_BOOK3S_64
+ hpdp->pd = (unsigned long)new |
+ (shift_to_mmu_psize(pshift) << 2);
+#else
+ hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
+#endif
+ }
+#endif
spin_unlock(&mm->page_table_lock);
return 0;
}
-/* Base page size affects how we walk hugetlb page tables */
-#ifdef CONFIG_PPC_64K_PAGES
-#define hpmd_offset(pud, addr) pmd_offset(pud, addr)
-#define hpmd_alloc(mm, pud, addr) pmd_alloc(mm, pud, addr)
+/*
+ * These macros define how to determine which level of the page table holds
+ * the hpdp.
+ */
+#ifdef CONFIG_PPC_FSL_BOOK3E
+#define HUGEPD_PGD_SHIFT PGDIR_SHIFT
+#define HUGEPD_PUD_SHIFT PUD_SHIFT
#else
-static inline
-pmd_t *hpmd_offset(pud_t *pud, unsigned long addr)
-{
- if (HPAGE_SHIFT == HPAGE_SHIFT_64K)
- return pmd_offset(pud, addr);
- else
- return (pmd_t *) pud;
-}
-static inline
-pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr)
-{
- if (HPAGE_SHIFT == HPAGE_SHIFT_64K)
- return pmd_alloc(mm, pud, addr);
- else
- return (pmd_t *) pud;
-}
+#define HUGEPD_PGD_SHIFT PUD_SHIFT
+#define HUGEPD_PUD_SHIFT PMD_SHIFT
#endif
-/* Modelled after find_linux_pte() */
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+#ifdef CONFIG_PPC_BOOK3S_64
+/*
+ * At this point we do the placement change only for BOOK3S 64. This would
+ * possibly work on other subarchs.
+ */
+pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
pgd_t *pg;
pud_t *pu;
pmd_t *pm;
+ hugepd_t *hpdp = NULL;
+ unsigned pshift = __ffs(sz);
+ unsigned pdshift = PGDIR_SHIFT;
- BUG_ON(get_slice_psize(mm, addr) != mmu_huge_psize);
-
- addr &= HPAGE_MASK;
-
+ addr &= ~(sz-1);
pg = pgd_offset(mm, addr);
- if (!pgd_none(*pg)) {
- pu = pud_offset(pg, addr);
- if (!pud_none(*pu)) {
- pm = hpmd_offset(pu, addr);
- if (!pmd_none(*pm))
- return hugepte_offset((hugepd_t *)pm, addr);
+
+ if (pshift == PGDIR_SHIFT)
+ /* 16GB huge page */
+ return (pte_t *) pg;
+ else if (pshift > PUD_SHIFT)
+ /*
+ * We need to use hugepd table
+ */
+ hpdp = (hugepd_t *)pg;
+ else {
+ pdshift = PUD_SHIFT;
+ pu = pud_alloc(mm, pg, addr);
+ if (pshift == PUD_SHIFT)
+ return (pte_t *)pu;
+ else if (pshift > PMD_SHIFT)
+ hpdp = (hugepd_t *)pu;
+ else {
+ pdshift = PMD_SHIFT;
+ pm = pmd_alloc(mm, pu, addr);
+ if (pshift == PMD_SHIFT)
+ /* 16MB hugepage */
+ return (pte_t *)pm;
+ else
+ hpdp = (hugepd_t *)pm;
}
}
+ if (!hpdp)
+ return NULL;
- return NULL;
+ BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
+
+ if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
+ return NULL;
+
+ return hugepte_offset(hpdp, addr, pdshift);
}
-pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
+#else
+
+pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
pgd_t *pg;
pud_t *pu;
pmd_t *pm;
hugepd_t *hpdp = NULL;
+ unsigned pshift = __ffs(sz);
+ unsigned pdshift = PGDIR_SHIFT;
- BUG_ON(get_slice_psize(mm, addr) != mmu_huge_psize);
-
- addr &= HPAGE_MASK;
+ addr &= ~(sz-1);
pg = pgd_offset(mm, addr);
- pu = pud_alloc(mm, pg, addr);
- if (pu) {
- pm = hpmd_alloc(mm, pu, addr);
- if (pm)
+ if (pshift >= HUGEPD_PGD_SHIFT) {
+ hpdp = (hugepd_t *)pg;
+ } else {
+ pdshift = PUD_SHIFT;
+ pu = pud_alloc(mm, pg, addr);
+ if (pshift >= HUGEPD_PUD_SHIFT) {
+ hpdp = (hugepd_t *)pu;
+ } else {
+ pdshift = PMD_SHIFT;
+ pm = pmd_alloc(mm, pu, addr);
hpdp = (hugepd_t *)pm;
+ }
}
- if (! hpdp)
+ if (!hpdp)
return NULL;
- if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr))
+ BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
+
+ if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
return NULL;
- return hugepte_offset(hpdp, addr);
+ return hugepte_offset(hpdp, addr, pdshift);
+}
+#endif
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+/* Build list of addresses of gigantic pages. This function is used in early
+ * boot before the buddy or bootmem allocator is setup.
+ */
+void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
+{
+ unsigned int idx = shift_to_mmu_psize(__ffs(page_size));
+ int i;
+
+ if (addr == 0)
+ return;
+
+ gpage_freearray[idx].nr_gpages = number_of_pages;
+
+ for (i = 0; i < number_of_pages; i++) {
+ gpage_freearray[idx].gpage_list[i] = addr;
+ addr += page_size;
+ }
+}
+
+/*
+ * Moves the gigantic page addresses from the temporary list to the
+ * huge_boot_pages list.
+ */
+int alloc_bootmem_huge_page(struct hstate *hstate)
+{
+ struct huge_bootmem_page *m;
+ int idx = shift_to_mmu_psize(huge_page_shift(hstate));
+ int nr_gpages = gpage_freearray[idx].nr_gpages;
+
+ if (nr_gpages == 0)
+ return 0;
+
+#ifdef CONFIG_HIGHMEM
+ /*
+ * If gpages can be in highmem we can't use the trick of storing the
+ * data structure in the page; allocate space for this
+ */
+ m = alloc_bootmem(sizeof(struct huge_bootmem_page));
+ m->phys = gpage_freearray[idx].gpage_list[--nr_gpages];
+#else
+ m = phys_to_virt(gpage_freearray[idx].gpage_list[--nr_gpages]);
+#endif
+
+ list_add(&m->list, &huge_boot_pages);
+ gpage_freearray[idx].nr_gpages = nr_gpages;
+ gpage_freearray[idx].gpage_list[nr_gpages] = 0;
+ m->hstate = hstate;
+
+ return 1;
+}
+/*
+ * Scan the command line hugepagesz= options for gigantic pages; store those in
+ * a list that we use to allocate the memory once all options are parsed.
+ */
+
+unsigned long gpage_npages[MMU_PAGE_COUNT];
+
+static int __init do_gpage_early_setup(char *param, char *val,
+ const char *unused)
+{
+ static phys_addr_t size;
+ unsigned long npages;
+
+ /*
+ * The hugepagesz and hugepages cmdline options are interleaved. We
+ * use the size variable to keep track of whether or not this was done
+ * properly and skip over instances where it is incorrect. Other
+ * command-line parsing code will issue warnings, so we don't need to.
+ *
+ */
+ if ((strcmp(param, "default_hugepagesz") == 0) ||
+ (strcmp(param, "hugepagesz") == 0)) {
+ size = memparse(val, NULL);
+ } else if (strcmp(param, "hugepages") == 0) {
+ if (size != 0) {
+ if (sscanf(val, "%lu", &npages) <= 0)
+ npages = 0;
+ gpage_npages[shift_to_mmu_psize(__ffs(size))] = npages;
+ size = 0;
+ }
+ }
+ return 0;
}
+
+/*
+ * This function allocates physical space for pages that are larger than the
+ * buddy allocator can handle. We want to allocate these in highmem because
+ * the amount of lowmem is limited. This means that this function MUST be
+ * called before lowmem_end_addr is set up in MMU_init() in order for the lmb
+ * allocate to grab highmem.
+ */
+void __init reserve_hugetlb_gpages(void)
+{
+ static __initdata char cmdline[COMMAND_LINE_SIZE];
+ phys_addr_t size, base;
+ int i;
+
+ strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE);
+ parse_args("hugetlb gpages", cmdline, NULL, 0, 0, 0,
+ &do_gpage_early_setup);
+
+ /*
+ * Walk gpage list in reverse, allocating larger page sizes first.
+ * Skip over unsupported sizes, or sizes that have 0 gpages allocated.
+ * When we reach the point in the list where pages are no longer
+ * considered gpages, we're done.
+ */
+ for (i = MMU_PAGE_COUNT-1; i >= 0; i--) {
+ if (mmu_psize_defs[i].shift == 0 || gpage_npages[i] == 0)
+ continue;
+ else if (mmu_psize_to_shift(i) < (MAX_ORDER + PAGE_SHIFT))
+ break;
+
+ size = (phys_addr_t)(1ULL << mmu_psize_to_shift(i));
+ base = memblock_alloc_base(size * gpage_npages[i], size,
+ MEMBLOCK_ALLOC_ANYWHERE);
+ add_gpage(base, size, gpage_npages[i]);
+ }
+}
+
+#else /* !PPC_FSL_BOOK3E */
+
+/* Build list of addresses of gigantic pages. This function is used in early
+ * boot before the buddy or bootmem allocator is setup.
+ */
+void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
+{
+ if (!addr)
+ return;
+ while (number_of_pages > 0) {
+ gpage_freearray[nr_gpages] = addr;
+ nr_gpages++;
+ number_of_pages--;
+ addr += page_size;
+ }
+}
+
+/* Moves the gigantic page addresses from the temporary list to the
+ * huge_boot_pages list.
+ */
+int alloc_bootmem_huge_page(struct hstate *hstate)
+{
+ struct huge_bootmem_page *m;
+ if (nr_gpages == 0)
+ return 0;
+ m = phys_to_virt(gpage_freearray[--nr_gpages]);
+ gpage_freearray[nr_gpages] = 0;
+ list_add(&m->list, &huge_boot_pages);
+ m->hstate = hstate;
+ return 1;
+}
+#endif
+
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
return 0;
}
-static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp)
+#ifdef CONFIG_PPC_FSL_BOOK3E
+#define HUGEPD_FREELIST_SIZE \
+ ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))
+
+struct hugepd_freelist {
+ struct rcu_head rcu;
+ unsigned int index;
+ void *ptes[0];
+};
+
+static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);
+
+static void hugepd_free_rcu_callback(struct rcu_head *head)
+{
+ struct hugepd_freelist *batch =
+ container_of(head, struct hugepd_freelist, rcu);
+ unsigned int i;
+
+ for (i = 0; i < batch->index; i++)
+ kmem_cache_free(hugepte_cache, batch->ptes[i]);
+
+ free_page((unsigned long)batch);
+}
+
+static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
+{
+ struct hugepd_freelist **batchp;
+
+ batchp = &get_cpu_var(hugepd_freelist_cur);
+
+ if (atomic_read(&tlb->mm->mm_users) < 2 ||
+ cpumask_equal(mm_cpumask(tlb->mm),
+ cpumask_of(smp_processor_id()))) {
+ kmem_cache_free(hugepte_cache, hugepte);
+ put_cpu_var(hugepd_freelist_cur);
+ return;
+ }
+
+ if (*batchp == NULL) {
+ *batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
+ (*batchp)->index = 0;
+ }
+
+ (*batchp)->ptes[(*batchp)->index++] = hugepte;
+ if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
+ call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
+ *batchp = NULL;
+ }
+ put_cpu_var(hugepd_freelist_cur);
+}
+#endif
+
+static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
+ unsigned long start, unsigned long end,
+ unsigned long floor, unsigned long ceiling)
{
pte_t *hugepte = hugepd_page(*hpdp);
+ int i;
+
+ unsigned long pdmask = ~((1UL << pdshift) - 1);
+ unsigned int num_hugepd = 1;
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+ /* Note: On fsl the hpdp may be the first of several */
+ num_hugepd = (1 << (hugepd_shift(*hpdp) - pdshift));
+#else
+ unsigned int shift = hugepd_shift(*hpdp);
+#endif
+
+ start &= pdmask;
+ if (start < floor)
+ return;
+ if (ceiling) {
+ ceiling &= pdmask;
+ if (! ceiling)
+ return;
+ }
+ if (end - 1 > ceiling - 1)
+ return;
+
+ for (i = 0; i < num_hugepd; i++, hpdp++)
+ hpdp->pd = 0;
- hpdp->pd = 0;
tlb->need_flush = 1;
- pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, HUGEPTE_CACHE_NUM,
- PGF_CACHENUM_MASK));
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+ hugepd_free(tlb, hugepte);
+#else
+ pgtable_free_tlb(tlb, hugepte, pdshift - shift);
+#endif
}
static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -181,13 +535,29 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
unsigned long start;
start = addr;
- pmd = pmd_offset(pud, addr);
do {
+ pmd = pmd_offset(pud, addr);
next = pmd_addr_end(addr, end);
- if (pmd_none(*pmd))
+ if (!is_hugepd(pmd)) {
+ /*
+ * if it is not hugepd pointer, we should already find
+ * it cleared.
+ */
+ WARN_ON(!pmd_none_or_clear_bad(pmd));
continue;
- free_hugepte_range(tlb, (hugepd_t *)pmd);
- } while (pmd++, addr = next, addr != end);
+ }
+#ifdef CONFIG_PPC_FSL_BOOK3E
+ /*
+ * Increment next by the size of the huge mapping since
+ * there may be more than one entry at this level for a
+ * single hugepage, but all of them point to
+ * the same kmem cache that holds the hugepte.
+ */
+ next = addr + (1 << hugepd_shift(*(hugepd_t *)pmd));
+#endif
+ free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
+ addr, next, floor, ceiling);
+ } while (addr = next, addr != end);
start &= PUD_MASK;
if (start < floor)
@@ -202,7 +572,7 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
pmd = pmd_offset(pud, start);
pud_clear(pud);
- pmd_free_tlb(tlb, pmd);
+ pmd_free_tlb(tlb, pmd, start);
}
static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
@@ -214,25 +584,28 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
unsigned long start;
start = addr;
- pud = pud_offset(pgd, addr);
do {
+ pud = pud_offset(pgd, addr);
next = pud_addr_end(addr, end);
-#ifdef CONFIG_PPC_64K_PAGES
- if (pud_none_or_clear_bad(pud))
- continue;
- hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
-#else
- if (HPAGE_SHIFT == HPAGE_SHIFT_64K) {
+ if (!is_hugepd(pud)) {
if (pud_none_or_clear_bad(pud))
continue;
- hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
+ hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
+ ceiling);
} else {
- if (pud_none(*pud))
- continue;
- free_hugepte_range(tlb, (hugepd_t *)pud);
- }
+#ifdef CONFIG_PPC_FSL_BOOK3E
+ /*
+ * Increment next by the size of the huge mapping since
+ * there may be more than one entry at this level for a
+ * single hugepage, but all of them point to
+ * the same kmem cache that holds the hugepte.
+ */
+ next = addr + (1 << hugepd_shift(*(hugepd_t *)pud));
#endif
- } while (pud++, addr = next, addr != end);
+ free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
+ addr, next, floor, ceiling);
+ }
+ } while (addr = next, addr != end);
start &= PGDIR_MASK;
if (start < floor)
@@ -247,103 +620,57 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
pud = pud_offset(pgd, start);
pgd_clear(pgd);
- pud_free_tlb(tlb, pud);
+ pud_free_tlb(tlb, pud, start);
}
/*
* This function frees user-level page tables of a process.
- *
- * Must be called with pagetable lock held.
*/
-void hugetlb_free_pgd_range(struct mmu_gather **tlb,
+void hugetlb_free_pgd_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
unsigned long floor, unsigned long ceiling)
{
pgd_t *pgd;
unsigned long next;
- unsigned long start;
/*
- * Comments below take from the normal free_pgd_range(). They
- * apply here too. The tests against HUGEPD_MASK below are
- * essential, because we *don't* test for this at the bottom
- * level. Without them we'll attempt to free a hugepte table
- * when we unmap just part of it, even if there are other
- * active mappings using it.
- *
- * The next few lines have given us lots of grief...
+ * Because there are a number of different possible pagetable
+ * layouts for hugepage ranges, we limit knowledge of how
+ * things should be laid out to the allocation path
+ * (huge_pte_alloc(), above). Everything else works out the
+ * structure as it goes from information in the hugepd
+ * pointers. That means that we can't here use the
+ * optimization used in the normal page free_pgd_range(), of
+ * checking whether we're actually covering a large enough
+ * range to have to do anything at the top level of the walk
+ * instead of at the bottom.
*
- * Why are we testing HUGEPD* at this top level? Because
- * often there will be no work to do at all, and we'd prefer
- * not to go all the way down to the bottom just to discover
- * that.
- *
- * Why all these "- 1"s? Because 0 represents both the bottom
- * of the address space and the top of it (using -1 for the
- * top wouldn't help much: the masks would do the wrong thing).
- * The rule is that addr 0 and floor 0 refer to the bottom of
- * the address space, but end 0 and ceiling 0 refer to the top
- * Comparisons need to use "end - 1" and "ceiling - 1" (though
- * that end 0 case should be mythical).
- *
- * Wherever addr is brought up or ceiling brought down, we
- * must be careful to reject "the opposite 0" before it
- * confuses the subsequent tests. But what about where end is
- * brought down by HUGEPD_SIZE below? no, end can't go down to
- * 0 there.
- *
- * Whereas we round start (addr) and ceiling down, by different
- * masks at different levels, in order to test whether a table
- * now has no other vmas using it, so can be freed, we don't
- * bother to round floor or end up - the tests don't need that.
+ * To make sense of this, you should probably go read the big
+ * block comment at the top of the normal free_pgd_range(),
+ * too.
*/
- addr &= HUGEPD_MASK;
- if (addr < floor) {
- addr += HUGEPD_SIZE;
- if (!addr)
- return;
- }
- if (ceiling) {
- ceiling &= HUGEPD_MASK;
- if (!ceiling)
- return;
- }
- if (end - 1 > ceiling - 1)
- end -= HUGEPD_SIZE;
- if (addr > end - 1)
- return;
-
- start = addr;
- pgd = pgd_offset((*tlb)->mm, addr);
do {
- BUG_ON(get_slice_psize((*tlb)->mm, addr) != mmu_huge_psize);
next = pgd_addr_end(addr, end);
- if (pgd_none_or_clear_bad(pgd))
- continue;
- hugetlb_free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
- } while (pgd++, addr = next, addr != end);
-}
-
-void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte)
-{
- if (pte_present(*ptep)) {
- /* We open-code pte_clear because we need to pass the right
- * argument to hpte_need_flush (huge / !huge). Might not be
- * necessary anymore if we make hpte_need_flush() get the
- * page size from the slices
- */
- pte_update(mm, addr & HPAGE_MASK, ptep, ~0UL, 1);
- }
- *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
-}
-
-pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep)
-{
- unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
- return __pte(old);
+ pgd = pgd_offset(tlb->mm, addr);
+ if (!is_hugepd(pgd)) {
+ if (pgd_none_or_clear_bad(pgd))
+ continue;
+ hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
+ } else {
+#ifdef CONFIG_PPC_FSL_BOOK3E
+ /*
+ * Increment next by the size of the huge mapping since
+ * there may be more than one entry at the pgd level
+ * for a single hugepage, but all of them point to the
+ * same kmem cache that holds the hugepte.
+ */
+ next = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
+#endif
+ free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
+ addr, next, floor, ceiling);
+ }
+ } while (addr = next, addr != end);
}
struct page *
@@ -351,23 +678,26 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
pte_t *ptep;
struct page *page;
+ unsigned shift;
+ unsigned long mask;
+ /*
+ * Transparent hugepages are handled by generic code. We can skip them
+ * here.
+ */
+ ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
- if (get_slice_psize(mm, address) != mmu_huge_psize)
+ /* Verify it is a huge page else bail. */
+ if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep))
return ERR_PTR(-EINVAL);
- ptep = huge_pte_offset(mm, address);
+ mask = (1UL << shift) - 1;
page = pte_page(*ptep);
if (page)
- page += (address % HPAGE_SIZE) / PAGE_SIZE;
+ page += (address & mask) / PAGE_SIZE;
return page;
}
-int pmd_huge(pmd_t pmd)
-{
- return 0;
-}
-
struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
pmd_t *pmd, int write)
@@ -376,244 +706,384 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
return NULL;
}
+static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
+ unsigned long sz)
+{
+ unsigned long __boundary = (addr + sz) & ~(sz-1);
+ return (__boundary - 1 < end - 1) ? __boundary : end;
+}
+
+int gup_hugepd(hugepd_t *hugepd, unsigned pdshift,
+ unsigned long addr, unsigned long end,
+ int write, struct page **pages, int *nr)
+{
+ pte_t *ptep;
+ unsigned long sz = 1UL << hugepd_shift(*hugepd);
+ unsigned long next;
+
+ ptep = hugepte_offset(hugepd, addr, pdshift);
+ do {
+ next = hugepte_addr_end(addr, end, sz);
+ if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
+ return 0;
+ } while (ptep++, addr = next, addr != end);
+
+ return 1;
+}
+#ifdef CONFIG_PPC_MM_SLICES
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
unsigned long len, unsigned long pgoff,
unsigned long flags)
{
- return slice_get_unmapped_area(addr, len, flags,
- mmu_huge_psize, 1, 0);
+ struct hstate *hstate = hstate_file(file);
+ int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
+
+ return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1);
}
+#endif
-/*
- * Called by asm hashtable.S for doing lazy icache flush
- */
-static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
- pte_t pte, int trap)
+unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
- struct page *page;
- int i;
+#ifdef CONFIG_PPC_MM_SLICES
+ unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
- if (!pfn_valid(pte_pfn(pte)))
- return rflags;
+ return 1UL << mmu_psize_to_shift(psize);
+#else
+ if (!is_vm_hugetlb_page(vma))
+ return PAGE_SIZE;
- page = pte_page(pte);
+ return huge_page_size(hstate_vma(vma));
+#endif
+}
- /* page is dirty */
- if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
- if (trap == 0x400) {
- for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++)
- __flush_dcache_icache(page_address(page+i));
- set_bit(PG_arch_1, &page->flags);
- } else {
- rflags |= HPTE_R_N;
- }
- }
- return rflags;
+static inline bool is_power_of_4(unsigned long x)
+{
+ if (is_power_of_2(x))
+ return (__ilog2(x) % 2) ? false : true;
+ return false;
}
-int hash_huge_page(struct mm_struct *mm, unsigned long access,
- unsigned long ea, unsigned long vsid, int local,
- unsigned long trap)
+static int __init add_huge_page_size(unsigned long long size)
{
- pte_t *ptep;
- unsigned long old_pte, new_pte;
- unsigned long va, rflags, pa;
- long slot;
- int err = 1;
- int ssize = user_segment_size(ea);
+ int shift = __ffs(size);
+ int mmu_psize;
- ptep = huge_pte_offset(mm, ea);
+ /* Check that it is a page size supported by the hardware and
+ * that it fits within pagetable and slice limits. */
+#ifdef CONFIG_PPC_FSL_BOOK3E
+ if ((size < PAGE_SIZE) || !is_power_of_4(size))
+ return -EINVAL;
+#else
+ if (!is_power_of_2(size)
+ || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT))
+ return -EINVAL;
+#endif
- /* Search the Linux page table for a match with va */
- va = hpt_va(ea, vsid, ssize);
+ if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
+ return -EINVAL;
- /*
- * If no pte found or not present, send the problem up to
- * do_page_fault
+#ifdef CONFIG_SPU_FS_64K_LS
+ /* Disable support for 64K huge pages when 64K SPU local store
+ * support is enabled as the current implementation conflicts.
*/
- if (unlikely(!ptep || pte_none(*ptep)))
- goto out;
+ if (shift == PAGE_SHIFT_64K)
+ return -EINVAL;
+#endif /* CONFIG_SPU_FS_64K_LS */
- /*
- * Check the user's access rights to the page. If access should be
- * prevented then send the problem up to do_page_fault.
- */
- if (unlikely(access & ~pte_val(*ptep)))
- goto out;
- /*
- * At this point, we have a pte (old_pte) which can be used to build
- * or update an HPTE. There are 2 cases:
- *
- * 1. There is a valid (present) pte with no associated HPTE (this is
- * the most common case)
- * 2. There is a valid (present) pte with an associated HPTE. The
- * current values of the pp bits in the HPTE prevent access
- * because we are doing software DIRTY bit management and the
- * page is currently not DIRTY.
- */
+ BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
+ /* Return if huge page size has already been setup */
+ if (size_to_hstate(size))
+ return 0;
- do {
- old_pte = pte_val(*ptep);
- if (old_pte & _PAGE_BUSY)
- goto out;
- new_pte = old_pte | _PAGE_BUSY |
- _PAGE_ACCESSED | _PAGE_HASHPTE;
- } while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
- old_pte, new_pte));
-
- rflags = 0x2 | (!(new_pte & _PAGE_RW));
- /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
- rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
- if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
- /* No CPU has hugepages but lacks no execute, so we
- * don't need to worry about that case */
- rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
- trap);
-
- /* Check if pte already has an hpte (case 2) */
- if (unlikely(old_pte & _PAGE_HASHPTE)) {
- /* There MIGHT be an HPTE for this pte */
- unsigned long hash, slot;
-
- hash = hpt_hash(va, HPAGE_SHIFT, ssize);
- if (old_pte & _PAGE_F_SECOND)
- hash = ~hash;
- slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
- slot += (old_pte & _PAGE_F_GIX) >> 12;
-
- if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_huge_psize,
- ssize, local) == -1)
- old_pte &= ~_PAGE_HPTEFLAGS;
- }
+ hugetlb_add_hstate(shift - PAGE_SHIFT);
- if (likely(!(old_pte & _PAGE_HASHPTE))) {
- unsigned long hash = hpt_hash(va, HPAGE_SHIFT, ssize);
- unsigned long hpte_group;
-
- pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
-
-repeat:
- hpte_group = ((hash & htab_hash_mask) *
- HPTES_PER_GROUP) & ~0x7UL;
-
- /* clear HPTE slot informations in new PTE */
- new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
-
- /* Add in WIMG bits */
- /* XXX We should store these in the pte */
- /* --BenH: I think they are ... */
- rflags |= _PAGE_COHERENT;
-
- /* Insert into the hash table, primary slot */
- slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
- mmu_huge_psize, ssize);
-
- /* Primary is full, try the secondary */
- if (unlikely(slot == -1)) {
- hpte_group = ((~hash & htab_hash_mask) *
- HPTES_PER_GROUP) & ~0x7UL;
- slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
- HPTE_V_SECONDARY,
- mmu_huge_psize, ssize);
- if (slot == -1) {
- if (mftb() & 0x1)
- hpte_group = ((hash & htab_hash_mask) *
- HPTES_PER_GROUP)&~0x7UL;
-
- ppc_md.hpte_remove(hpte_group);
- goto repeat;
- }
- }
+ return 0;
+}
+
+static int __init hugepage_setup_sz(char *str)
+{
+ unsigned long long size;
+
+ size = memparse(str, &str);
+
+ if (add_huge_page_size(size) != 0)
+ printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);
+
+ return 1;
+}
+__setup("hugepagesz=", hugepage_setup_sz);
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+struct kmem_cache *hugepte_cache;
+static int __init hugetlbpage_init(void)
+{
+ int psize;
+
+ for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+ unsigned shift;
- if (unlikely(slot == -2))
- panic("hash_huge_page: pte_insert failed\n");
+ if (!mmu_psize_defs[psize].shift)
+ continue;
- new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
+ shift = mmu_psize_to_shift(psize);
+
+ /* Don't treat normal page sizes as huge... */
+ if (shift != PAGE_SHIFT)
+ if (add_huge_page_size(1ULL << shift) < 0)
+ continue;
}
/*
- * No need to use ldarx/stdcx here
+ * Create a kmem cache for hugeptes. The bottom bits in the pte have
+ * size information encoded in them, so align them to allow this
*/
- *ptep = __pte(new_pte & ~_PAGE_BUSY);
+ hugepte_cache = kmem_cache_create("hugepte-cache", sizeof(pte_t),
+ HUGEPD_SHIFT_MASK + 1, 0, NULL);
+ if (hugepte_cache == NULL)
+ panic("%s: Unable to create kmem cache for hugeptes\n",
+ __func__);
+
+ /* Default hpage size = 4M */
+ if (mmu_psize_defs[MMU_PAGE_4M].shift)
+ HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
+ else
+ panic("%s: Unable to set default huge page size\n", __func__);
- err = 0;
- out:
- return err;
+ return 0;
}
-
-void set_huge_psize(int psize)
-{
- /* Check that it is a page size supported by the hardware and
- * that it fits within pagetable limits. */
- if (mmu_psize_defs[psize].shift && mmu_psize_defs[psize].shift < SID_SHIFT &&
- (mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT ||
- mmu_psize_defs[psize].shift == HPAGE_SHIFT_64K)) {
- HPAGE_SHIFT = mmu_psize_defs[psize].shift;
- mmu_huge_psize = psize;
-#ifdef CONFIG_PPC_64K_PAGES
- hugepte_shift = (PMD_SHIFT-HPAGE_SHIFT);
#else
- if (HPAGE_SHIFT == HPAGE_SHIFT_64K)
- hugepte_shift = (PMD_SHIFT-HPAGE_SHIFT);
+static int __init hugetlbpage_init(void)
+{
+ int psize;
+
+ if (!mmu_has_feature(MMU_FTR_16M_PAGE))
+ return -ENODEV;
+
+ for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+ unsigned shift;
+ unsigned pdshift;
+
+ if (!mmu_psize_defs[psize].shift)
+ continue;
+
+ shift = mmu_psize_to_shift(psize);
+
+ if (add_huge_page_size(1ULL << shift) < 0)
+ continue;
+
+ if (shift < PMD_SHIFT)
+ pdshift = PMD_SHIFT;
+ else if (shift < PUD_SHIFT)
+ pdshift = PUD_SHIFT;
else
- hugepte_shift = (PUD_SHIFT-HPAGE_SHIFT);
-#endif
+ pdshift = PGDIR_SHIFT;
+ /*
+ * if we have pdshift and shift value same, we don't
+ * use pgt cache for hugepd.
+ */
+ if (pdshift != shift) {
+ pgtable_cache_add(pdshift - shift, NULL);
+ if (!PGT_CACHE(pdshift - shift))
+ panic("hugetlbpage_init(): could not create "
+ "pgtable cache for %d bit pagesize\n", shift);
+ }
+ }
- } else
- HPAGE_SHIFT = 0;
+ /* Set default large page size. Currently, we pick 16M or 1M
+ * depending on what is available
+ */
+ if (mmu_psize_defs[MMU_PAGE_16M].shift)
+ HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
+ else if (mmu_psize_defs[MMU_PAGE_1M].shift)
+ HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
+
+ return 0;
}
+#endif
+module_init(hugetlbpage_init);
-static int __init hugepage_setup_sz(char *str)
+void flush_dcache_icache_hugepage(struct page *page)
{
- unsigned long long size;
- int mmu_psize = -1;
- int shift;
+ int i;
+ void *start;
- size = memparse(str, &str);
+ BUG_ON(!PageCompound(page));
- shift = __ffs(size);
- switch (shift) {
-#ifndef CONFIG_PPC_64K_PAGES
- case HPAGE_SHIFT_64K:
- mmu_psize = MMU_PAGE_64K;
- break;
-#endif
- case HPAGE_SHIFT_16M:
- mmu_psize = MMU_PAGE_16M;
- break;
+ for (i = 0; i < (1UL << compound_order(page)); i++) {
+ if (!PageHighMem(page)) {
+ __flush_dcache_icache(page_address(page+i));
+ } else {
+ start = kmap_atomic(page+i);
+ __flush_dcache_icache(start);
+ kunmap_atomic(start);
+ }
}
+}
- if (mmu_psize >=0 && mmu_psize_defs[mmu_psize].shift)
- set_huge_psize(mmu_psize);
- else
- printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);
+#endif /* CONFIG_HUGETLB_PAGE */
- return 1;
-}
-__setup("hugepagesz=", hugepage_setup_sz);
+/*
+ * We have 4 cases for pgds and pmds:
+ * (1) invalid (all zeroes)
+ * (2) pointer to next table, as normal; bottom 6 bits == 0
+ * (3) leaf pte for huge page, bottom two bits != 00
+ * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table
+ *
+ * So long as we atomically load page table pointers we are safe against teardown,
+ * we can follow the address down to the the page and take a ref on it.
+ */
-static void zero_ctor(struct kmem_cache *cache, void *addr)
+pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
{
- memset(addr, 0, kmem_cache_size(cache));
+ pgd_t pgd, *pgdp;
+ pud_t pud, *pudp;
+ pmd_t pmd, *pmdp;
+ pte_t *ret_pte;
+ hugepd_t *hpdp = NULL;
+ unsigned pdshift = PGDIR_SHIFT;
+
+ if (shift)
+ *shift = 0;
+
+ pgdp = pgdir + pgd_index(ea);
+ pgd = ACCESS_ONCE(*pgdp);
+ /*
+ * Always operate on the local stack value. This make sure the
+ * value don't get updated by a parallel THP split/collapse,
+ * page fault or a page unmap. The return pte_t * is still not
+ * stable. So should be checked there for above conditions.
+ */
+ if (pgd_none(pgd))
+ return NULL;
+ else if (pgd_huge(pgd)) {
+ ret_pte = (pte_t *) pgdp;
+ goto out;
+ } else if (is_hugepd(&pgd))
+ hpdp = (hugepd_t *)&pgd;
+ else {
+ /*
+ * Even if we end up with an unmap, the pgtable will not
+ * be freed, because we do an rcu free and here we are
+ * irq disabled
+ */
+ pdshift = PUD_SHIFT;
+ pudp = pud_offset(&pgd, ea);
+ pud = ACCESS_ONCE(*pudp);
+
+ if (pud_none(pud))
+ return NULL;
+ else if (pud_huge(pud)) {
+ ret_pte = (pte_t *) pudp;
+ goto out;
+ } else if (is_hugepd(&pud))
+ hpdp = (hugepd_t *)&pud;
+ else {
+ pdshift = PMD_SHIFT;
+ pmdp = pmd_offset(&pud, ea);
+ pmd = ACCESS_ONCE(*pmdp);
+ /*
+ * A hugepage collapse is captured by pmd_none, because
+ * it mark the pmd none and do a hpte invalidate.
+ *
+ * A hugepage split is captured by pmd_trans_splitting
+ * because we mark the pmd trans splitting and do a
+ * hpte invalidate
+ *
+ */
+ if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+ return NULL;
+
+ if (pmd_huge(pmd) || pmd_large(pmd)) {
+ ret_pte = (pte_t *) pmdp;
+ goto out;
+ } else if (is_hugepd(&pmd))
+ hpdp = (hugepd_t *)&pmd;
+ else
+ return pte_offset_kernel(&pmd, ea);
+ }
+ }
+ if (!hpdp)
+ return NULL;
+
+ ret_pte = hugepte_offset(hpdp, ea, pdshift);
+ pdshift = hugepd_shift(*hpdp);
+out:
+ if (shift)
+ *shift = pdshift;
+ return ret_pte;
}
+EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte);
-static int __init hugetlbpage_init(void)
+int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
+ unsigned long end, int write, struct page **pages, int *nr)
{
- if (!cpu_has_feature(CPU_FTR_16M_PAGE))
- return -ENODEV;
+ unsigned long mask;
+ unsigned long pte_end;
+ struct page *head, *page, *tail;
+ pte_t pte;
+ int refs;
- huge_pgtable_cache = kmem_cache_create("hugepte_cache",
- HUGEPTE_TABLE_SIZE,
- HUGEPTE_TABLE_SIZE,
- 0,
- zero_ctor);
- if (! huge_pgtable_cache)
- panic("hugetlbpage_init(): could not create hugepte cache\n");
+ pte_end = (addr + sz) & ~(sz-1);
+ if (pte_end < end)
+ end = pte_end;
- return 0;
-}
+ pte = ACCESS_ONCE(*ptep);
+ mask = _PAGE_PRESENT | _PAGE_USER;
+ if (write)
+ mask |= _PAGE_RW;
-module_init(hugetlbpage_init);
+ if ((pte_val(pte) & mask) != mask)
+ return 0;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ /*
+ * check for splitting here
+ */
+ if (pmd_trans_splitting(pte_pmd(pte)))
+ return 0;
+#endif
+
+ /* hugepages are never "special" */
+ VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+ refs = 0;
+ head = pte_page(pte);
+
+ page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
+ tail = page;
+ do {
+ VM_BUG_ON(compound_head(page) != head);
+ pages[*nr] = page;
+ (*nr)++;
+ page++;
+ refs++;
+ } while (addr += PAGE_SIZE, addr != end);
+
+ if (!page_cache_add_speculative(head, refs)) {
+ *nr -= refs;
+ return 0;
+ }
+
+ if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+ /* Could be optimized better */
+ *nr -= refs;
+ while (refs--)
+ put_page(head);
+ return 0;
+ }
+
+ /*
+ * Any tail page need their mapcount reference taken before we
+ * return.
+ */
+ while (refs--) {
+ if (PageTail(tail))
+ get_huge_page_tail(tail);
+ tail++;
+ }
+
+ return 1;
+}
diff --git a/arch/powerpc/mm/icswx.c b/arch/powerpc/mm/icswx.c
new file mode 100644
index 00000000000..915412e4d5b
--- /dev/null
+++ b/arch/powerpc/mm/icswx.c
@@ -0,0 +1,292 @@
+/*
+ * ICSWX and ACOP Management
+ *
+ * Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+
+#include "icswx.h"
+
+/*
+ * The processor and its L2 cache cause the icswx instruction to
+ * generate a COP_REQ transaction on PowerBus. The transaction has no
+ * address, and the processor does not perform an MMU access to
+ * authenticate the transaction. The command portion of the PowerBus
+ * COP_REQ transaction includes the LPAR_ID (LPID) and the coprocessor
+ * Process ID (PID), which the coprocessor compares to the authorized
+ * LPID and PID held in the coprocessor, to determine if the process
+ * is authorized to generate the transaction. The data of the COP_REQ
+ * transaction is 128-byte or less in size and is placed in cacheable
+ * memory on a 128-byte cache line boundary.
+ *
+ * The task to use a coprocessor should use use_cop() to mark the use
+ * of the Coprocessor Type (CT) and context switching. On a server
+ * class processor, the PID register is used only for coprocessor
+ * management + * and so a coprocessor PID is allocated before
+ * executing icswx + * instruction. Drop_cop() is used to free the
+ * coprocessor PID.
+ *
+ * Example:
+ * Host Fabric Interface (HFI) is a PowerPC network coprocessor.
+ * Each HFI have multiple windows. Each HFI window serves as a
+ * network device sending to and receiving from HFI network.
+ * HFI immediate send function uses icswx instruction. The immediate
+ * send function allows small (single cache-line) packets be sent
+ * without using the regular HFI send FIFO and doorbell, which are
+ * much slower than immediate send.
+ *
+ * For each task intending to use HFI immediate send, the HFI driver
+ * calls use_cop() to obtain a coprocessor PID for the task.
+ * The HFI driver then allocate a free HFI window and save the
+ * coprocessor PID to the HFI window to allow the task to use the
+ * HFI window.
+ *
+ * The HFI driver repeatedly creates immediate send packets and
+ * issues icswx instruction to send data through the HFI window.
+ * The HFI compares the coprocessor PID in the CPU PID register
+ * to the PID held in the HFI window to determine if the transaction
+ * is allowed.
+ *
+ * When the task to release the HFI window, the HFI driver calls
+ * drop_cop() to release the coprocessor PID.
+ */
+
+void switch_cop(struct mm_struct *next)
+{
+#ifdef CONFIG_PPC_ICSWX_PID
+ mtspr(SPRN_PID, next->context.cop_pid);
+#endif
+ mtspr(SPRN_ACOP, next->context.acop);
+}
+
+/**
+ * Start using a coprocessor.
+ * @acop: mask of coprocessor to be used.
+ * @mm: The mm the coprocessor to associate with. Most likely current mm.
+ *
+ * Return a positive PID if successful. Negative errno otherwise.
+ * The returned PID will be fed to the coprocessor to determine if an
+ * icswx transaction is authenticated.
+ */
+int use_cop(unsigned long acop, struct mm_struct *mm)
+{
+ int ret;
+
+ if (!cpu_has_feature(CPU_FTR_ICSWX))
+ return -ENODEV;
+
+ if (!mm || !acop)
+ return -EINVAL;
+
+ /* The page_table_lock ensures mm_users won't change under us */
+ spin_lock(&mm->page_table_lock);
+ spin_lock(mm->context.cop_lockp);
+
+ ret = get_cop_pid(mm);
+ if (ret < 0)
+ goto out;
+
+ /* update acop */
+ mm->context.acop |= acop;
+
+ sync_cop(mm);
+
+ /*
+ * If this is a threaded process then there might be other threads
+ * running. We need to send an IPI to force them to pick up any
+ * change in PID and ACOP.
+ */
+ if (atomic_read(&mm->mm_users) > 1)
+ smp_call_function(sync_cop, mm, 1);
+
+out:
+ spin_unlock(mm->context.cop_lockp);
+ spin_unlock(&mm->page_table_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(use_cop);
+
+/**
+ * Stop using a coprocessor.
+ * @acop: mask of coprocessor to be stopped.
+ * @mm: The mm the coprocessor associated with.
+ */
+void drop_cop(unsigned long acop, struct mm_struct *mm)
+{
+ int free_pid;
+
+ if (!cpu_has_feature(CPU_FTR_ICSWX))
+ return;
+
+ if (WARN_ON_ONCE(!mm))
+ return;
+
+ /* The page_table_lock ensures mm_users won't change under us */
+ spin_lock(&mm->page_table_lock);
+ spin_lock(mm->context.cop_lockp);
+
+ mm->context.acop &= ~acop;
+
+ free_pid = disable_cop_pid(mm);
+ sync_cop(mm);
+
+ /*
+ * If this is a threaded process then there might be other threads
+ * running. We need to send an IPI to force them to pick up any
+ * change in PID and ACOP.
+ */
+ if (atomic_read(&mm->mm_users) > 1)
+ smp_call_function(sync_cop, mm, 1);
+
+ if (free_pid != COP_PID_NONE)
+ free_cop_pid(free_pid);
+
+ spin_unlock(mm->context.cop_lockp);
+ spin_unlock(&mm->page_table_lock);
+}
+EXPORT_SYMBOL_GPL(drop_cop);
+
+static int acop_use_cop(int ct)
+{
+ /* There is no alternate policy, yet */
+ return -1;
+}
+
+/*
+ * Get the instruction word at the NIP
+ */
+static u32 acop_get_inst(struct pt_regs *regs)
+{
+ u32 inst;
+ u32 __user *p;
+
+ p = (u32 __user *)regs->nip;
+ if (!access_ok(VERIFY_READ, p, sizeof(*p)))
+ return 0;
+
+ if (__get_user(inst, p))
+ return 0;
+
+ return inst;
+}
+
+/**
+ * @regs: regsiters at time of interrupt
+ * @address: storage address
+ * @error_code: Fault code, usually the DSISR or ESR depending on
+ * processor type
+ *
+ * Return 0 if we are able to resolve the data storage fault that
+ * results from a CT miss in the ACOP register.
+ */
+int acop_handle_fault(struct pt_regs *regs, unsigned long address,
+ unsigned long error_code)
+{
+ int ct;
+ u32 inst = 0;
+
+ if (!cpu_has_feature(CPU_FTR_ICSWX)) {
+ pr_info("No coprocessors available");
+ _exception(SIGILL, regs, ILL_ILLOPN, address);
+ }
+
+ if (!user_mode(regs)) {
+ /* this could happen if the HV denies the
+ * kernel access, for now we just die */
+ die("ICSWX from kernel failed", regs, SIGSEGV);
+ }
+
+ /* Some implementations leave us a hint for the CT */
+ ct = ICSWX_GET_CT_HINT(error_code);
+ if (ct < 0) {
+ /* we have to peek at the instruction word to figure out CT */
+ u32 ccw;
+ u32 rs;
+
+ inst = acop_get_inst(regs);
+ if (inst == 0)
+ return -1;
+
+ rs = (inst >> (31 - 10)) & 0x1f;
+ ccw = regs->gpr[rs];
+ ct = (ccw >> 16) & 0x3f;
+ }
+
+ /*
+ * We could be here because another thread has enabled acop
+ * but the ACOP register has yet to be updated.
+ *
+ * This should have been taken care of by the IPI to sync all
+ * the threads (see smp_call_function(sync_cop, mm, 1)), but
+ * that could take forever if there are a significant amount
+ * of threads.
+ *
+ * Given the number of threads on some of these systems,
+ * perhaps this is the best way to sync ACOP rather than whack
+ * every thread with an IPI.
+ */
+ if ((acop_copro_type_bit(ct) & current->active_mm->context.acop) != 0) {
+ sync_cop(current->active_mm);
+ return 0;
+ }
+
+ /* check for alternate policy */
+ if (!acop_use_cop(ct))
+ return 0;
+
+ /* at this point the CT is unknown to the system */
+ pr_warn("%s[%d]: Coprocessor %d is unavailable\n",
+ current->comm, current->pid, ct);
+
+ /* get inst if we don't already have it */
+ if (inst == 0) {
+ inst = acop_get_inst(regs);
+ if (inst == 0)
+ return -1;
+ }
+
+ /* Check if the instruction is the "record form" */
+ if (inst & 1) {
+ /*
+ * the instruction is "record" form so we can reject
+ * using CR0
+ */
+ regs->ccr &= ~(0xful << 28);
+ regs->ccr |= ICSWX_RC_NOT_FOUND << 28;
+
+ /* Move on to the next instruction */
+ regs->nip += 4;
+ } else {
+ /*
+ * There is no architected mechanism to report a bad
+ * CT so we could either SIGILL or report nothing.
+ * Since the non-record version should only bu used
+ * for "hints" or "don't care" we should probably do
+ * nothing. However, I could see how some people
+ * might want an SIGILL so it here if you want it.
+ */
+#ifdef CONFIG_PPC_ICSWX_USE_SIGILL
+ _exception(SIGILL, regs, ILL_ILLOPN, address);
+#else
+ regs->nip += 4;
+#endif
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(acop_handle_fault);
diff --git a/arch/powerpc/mm/icswx.h b/arch/powerpc/mm/icswx.h
new file mode 100644
index 00000000000..6dedc08e62c
--- /dev/null
+++ b/arch/powerpc/mm/icswx.h
@@ -0,0 +1,68 @@
+#ifndef _ARCH_POWERPC_MM_ICSWX_H_
+#define _ARCH_POWERPC_MM_ICSWX_H_
+
+/*
+ * ICSWX and ACOP Management
+ *
+ * Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <asm/mmu_context.h>
+
+/* also used to denote that PIDs are not used */
+#define COP_PID_NONE 0
+
+static inline void sync_cop(void *arg)
+{
+ struct mm_struct *mm = arg;
+
+ if (mm == current->active_mm)
+ switch_cop(current->active_mm);
+}
+
+#ifdef CONFIG_PPC_ICSWX_PID
+extern int get_cop_pid(struct mm_struct *mm);
+extern int disable_cop_pid(struct mm_struct *mm);
+extern void free_cop_pid(int free_pid);
+#else
+#define get_cop_pid(m) (COP_PID_NONE)
+#define disable_cop_pid(m) (COP_PID_NONE)
+#define free_cop_pid(p)
+#endif
+
+/*
+ * These are implementation bits for architected registers. If this
+ * ever becomes architecture the should be moved to reg.h et. al.
+ */
+/* UCT is the same bit for Server and Embedded */
+#define ICSWX_DSI_UCT 0x00004000 /* Unavailable Coprocessor Type */
+
+#ifdef CONFIG_PPC_BOOK3E
+/* Embedded implementation gives us no hints as to what the CT is */
+#define ICSWX_GET_CT_HINT(x) (-1)
+#else
+/* Server implementation contains the CT value in the DSISR */
+#define ICSWX_DSISR_CTMASK 0x00003f00
+#define ICSWX_GET_CT_HINT(x) (((x) & ICSWX_DSISR_CTMASK) >> 8)
+#endif
+
+#define ICSWX_RC_STARTED 0x8 /* The request has been started */
+#define ICSWX_RC_NOT_IDLE 0x4 /* No coprocessor found idle */
+#define ICSWX_RC_NOT_FOUND 0x2 /* No coprocessor found */
+#define ICSWX_RC_UNDEFINED 0x1 /* Reserved */
+
+extern int acop_handle_fault(struct pt_regs *regs, unsigned long address,
+ unsigned long error_code);
+
+static inline u64 acop_copro_type_bit(unsigned int type)
+{
+ return 1ULL << (63 - type);
+}
+
+#endif /* !_ARCH_POWERPC_MM_ICSWX_H_ */
diff --git a/arch/powerpc/mm/icswx_pid.c b/arch/powerpc/mm/icswx_pid.c
new file mode 100644
index 00000000000..91e30eb7d05
--- /dev/null
+++ b/arch/powerpc/mm/icswx_pid.c
@@ -0,0 +1,87 @@
+/*
+ * ICSWX and ACOP/PID Management
+ *
+ * Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/idr.h>
+#include <linux/module.h>
+#include "icswx.h"
+
+#define COP_PID_MIN (COP_PID_NONE + 1)
+#define COP_PID_MAX (0xFFFF)
+
+static DEFINE_SPINLOCK(mmu_context_acop_lock);
+static DEFINE_IDA(cop_ida);
+
+static int new_cop_pid(struct ida *ida, int min_id, int max_id,
+ spinlock_t *lock)
+{
+ int index;
+ int err;
+
+again:
+ if (!ida_pre_get(ida, GFP_KERNEL))
+ return -ENOMEM;
+
+ spin_lock(lock);
+ err = ida_get_new_above(ida, min_id, &index);
+ spin_unlock(lock);
+
+ if (err == -EAGAIN)
+ goto again;
+ else if (err)
+ return err;
+
+ if (index > max_id) {
+ spin_lock(lock);
+ ida_remove(ida, index);
+ spin_unlock(lock);
+ return -ENOMEM;
+ }
+
+ return index;
+}
+
+int get_cop_pid(struct mm_struct *mm)
+{
+ int pid;
+
+ if (mm->context.cop_pid == COP_PID_NONE) {
+ pid = new_cop_pid(&cop_ida, COP_PID_MIN, COP_PID_MAX,
+ &mmu_context_acop_lock);
+ if (pid >= 0)
+ mm->context.cop_pid = pid;
+ }
+ return mm->context.cop_pid;
+}
+
+int disable_cop_pid(struct mm_struct *mm)
+{
+ int free_pid = COP_PID_NONE;
+
+ if ((!mm->context.acop) && (mm->context.cop_pid != COP_PID_NONE)) {
+ free_pid = mm->context.cop_pid;
+ mm->context.cop_pid = COP_PID_NONE;
+ }
+ return free_pid;
+}
+
+void free_cop_pid(int free_pid)
+{
+ spin_lock(&mmu_context_acop_lock);
+ ida_remove(&cop_ida, free_pid);
+ spin_unlock(&mmu_context_acop_lock);
+}
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index 977cb1ee5e7..cff59f1bec2 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -30,37 +30,48 @@
#include <linux/highmem.h>
#include <linux/initrd.h>
#include <linux/pagemap.h>
+#include <linux/memblock.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/hugetlb.h>
#include <asm/pgalloc.h>
#include <asm/prom.h>
#include <asm/io.h>
-#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/smp.h>
#include <asm/machdep.h>
#include <asm/btext.h>
#include <asm/tlb.h>
-#include <asm/lmb.h>
#include <asm/sections.h>
+#include <asm/hugetlb.h>
#include "mmu_decl.h"
#if defined(CONFIG_KERNEL_START_BOOL) || defined(CONFIG_LOWMEM_SIZE_BOOL)
-/* The ammount of lowmem must be within 0xF0000000 - KERNELBASE. */
-#if (CONFIG_LOWMEM_SIZE > (0xF0000000 - KERNELBASE))
-#error "You must adjust CONFIG_LOWMEM_SIZE or CONFIG_START_KERNEL"
+/* The amount of lowmem must be within 0xF0000000 - KERNELBASE. */
+#if (CONFIG_LOWMEM_SIZE > (0xF0000000 - PAGE_OFFSET))
+#error "You must adjust CONFIG_LOWMEM_SIZE or CONFIG_KERNEL_START"
#endif
#endif
#define MAX_LOW_MEM CONFIG_LOWMEM_SIZE
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+phys_addr_t total_memory;
+phys_addr_t total_lowmem;
-unsigned long total_memory;
-unsigned long total_lowmem;
+phys_addr_t memstart_addr = (phys_addr_t)~0ull;
+EXPORT_SYMBOL(memstart_addr);
+phys_addr_t kernstart_addr;
+EXPORT_SYMBOL(kernstart_addr);
-unsigned long ppc_memstart;
-unsigned long ppc_memoffset = PAGE_OFFSET;
+#ifdef CONFIG_RELOCATABLE_PPC32
+/* Used in __va()/__pa() */
+long long virt_phys_offset;
+EXPORT_SYMBOL(virt_phys_offset);
+#endif
+
+phys_addr_t lowmem_end_addr;
int boot_mapsize;
#ifdef CONFIG_PPC_PMAC
@@ -68,21 +79,11 @@ unsigned long agp_special_page;
EXPORT_SYMBOL(agp_special_page);
#endif
-#ifdef CONFIG_HIGHMEM
-pte_t *kmap_pte;
-pgprot_t kmap_prot;
-
-EXPORT_SYMBOL(kmap_prot);
-EXPORT_SYMBOL(kmap_pte);
-#endif
-
void MMU_init(void);
/* XXX should be in current.h -- paulus */
extern struct task_struct *current_set[NR_CPUS];
-extern int init_bootmem_done;
-
/*
* this tells the system to map all of ram with the segregs
* (i.e. page tables) instead of the bats.
@@ -91,14 +92,13 @@ extern int init_bootmem_done;
int __map_without_bats;
int __map_without_ltlbs;
-/* max amount of low RAM to map in */
-unsigned long __max_low_memory = MAX_LOW_MEM;
-
/*
- * limit of what is accessible with initial MMU setup -
- * 256MB usually, but only 16MB on 601.
+ * This tells the system to allow ioremapping memory marked as reserved.
*/
-unsigned long __initial_memory_limit = 0x10000000;
+int __allow_ioremap_reserved;
+
+/* max amount of low RAM to map in */
+unsigned long __max_low_memory = MAX_LOW_MEM;
/*
* Check for command-line options that affect what MMU_init will do.
@@ -129,24 +129,26 @@ void __init MMU_init(void)
if (ppc_md.progress)
ppc_md.progress("MMU:enter", 0x111);
- /* 601 can only access 16MB at the moment */
- if (PVR_VER(mfspr(SPRN_PVR)) == 1)
- __initial_memory_limit = 0x01000000;
- /* 8xx can only access 8MB at the moment */
- if (PVR_VER(mfspr(SPRN_PVR)) == 0x50)
- __initial_memory_limit = 0x00800000;
-
/* parse args from command line */
MMU_setup();
- if (lmb.memory.cnt > 1) {
- lmb.memory.cnt = 1;
- lmb_analyze();
+ /*
+ * Reserve gigantic pages for hugetlb. This MUST occur before
+ * lowmem_end_addr is initialized below.
+ */
+ reserve_hugetlb_gpages();
+
+ if (memblock.memory.cnt > 1) {
+#ifndef CONFIG_WII
+ memblock_enforce_memory_limit(memblock.memory.regions[0].size);
printk(KERN_WARNING "Only using first contiguous memory region");
+#else
+ wii_memory_fixups();
+#endif
}
- total_memory = lmb_end_of_DRAM();
- total_lowmem = total_memory;
+ total_lowmem = total_memory = memblock_end_of_DRAM() - memstart_addr;
+ lowmem_end_addr = memstart_addr + total_lowmem;
#ifdef CONFIG_FSL_BOOKE
/* Freescale Book-E parts expect lowmem to be mapped by fixed TLB
@@ -157,10 +159,10 @@ void __init MMU_init(void)
if (total_lowmem > __max_low_memory) {
total_lowmem = __max_low_memory;
+ lowmem_end_addr = memstart_addr + total_lowmem;
#ifndef CONFIG_HIGHMEM
total_memory = total_lowmem;
- lmb_enforce_memory_limit(total_lowmem);
- lmb_analyze();
+ memblock_enforce_memory_limit(total_lowmem);
#endif /* CONFIG_HIGHMEM */
}
@@ -174,21 +176,12 @@ void __init MMU_init(void)
ppc_md.progress("MMU:mapin", 0x301);
mapin_ram();
-#ifdef CONFIG_HIGHMEM
- ioremap_base = PKMAP_BASE;
-#else
- ioremap_base = 0xfe000000UL; /* for now, could be 0xfffff000 */
-#endif /* CONFIG_HIGHMEM */
- ioremap_bot = ioremap_base;
+ /* Initialize early top-down ioremap allocator */
+ ioremap_bot = IOREMAP_TOP;
/* Map in I/O resources */
if (ppc_md.progress)
ppc_md.progress("MMU:setio", 0x302);
- if (ppc_md.setup_io_mappings)
- ppc_md.setup_io_mappings();
-
- /* Initialize the context management stuff */
- mmu_context_init();
if (ppc_md.progress)
ppc_md.progress("MMU:exit", 0x211);
@@ -197,101 +190,35 @@ void __init MMU_init(void)
#ifdef CONFIG_BOOTX_TEXT
btext_unmap();
#endif
+
+ /* Shortly after that, the entire linear mapping will be available */
+ memblock_set_current_limit(lowmem_end_addr);
}
/* This is only called until mem_init is done. */
void __init *early_get_page(void)
{
- void *p;
-
- if (init_bootmem_done) {
- p = alloc_bootmem_pages(PAGE_SIZE);
- } else {
- p = __va(lmb_alloc_base(PAGE_SIZE, PAGE_SIZE,
- __initial_memory_limit));
- }
- return p;
-}
-
-/* Free up now-unused memory */
-static void free_sec(unsigned long start, unsigned long end, const char *name)
-{
- unsigned long cnt = 0;
-
- while (start < end) {
- ClearPageReserved(virt_to_page(start));
- init_page_count(virt_to_page(start));
- free_page(start);
- cnt++;
- start += PAGE_SIZE;
- }
- if (cnt) {
- printk(" %ldk %s", cnt << (PAGE_SHIFT - 10), name);
- totalram_pages += cnt;
- }
-}
-
-void free_initmem(void)
-{
-#define FREESEC(TYPE) \
- free_sec((unsigned long)(&__ ## TYPE ## _begin), \
- (unsigned long)(&__ ## TYPE ## _end), \
- #TYPE);
-
- printk ("Freeing unused kernel memory:");
- FREESEC(init);
- printk("\n");
- ppc_md.progress = NULL;
-#undef FREESEC
+ if (init_bootmem_done)
+ return alloc_bootmem_pages(PAGE_SIZE);
+ else
+ return __va(memblock_alloc(PAGE_SIZE, PAGE_SIZE));
}
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
+#ifdef CONFIG_8xx /* No 8xx specific .c file to put that in ... */
+void setup_initial_memory_limit(phys_addr_t first_memblock_base,
+ phys_addr_t first_memblock_size)
{
- if (start < end)
- printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
- for (; start < end; start += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(start));
- init_page_count(virt_to_page(start));
- free_page(start);
- totalram_pages++;
- }
-}
+ /* We don't currently support the first MEMBLOCK not mapping 0
+ * physical on those processors
+ */
+ BUG_ON(first_memblock_base != 0);
+
+#ifdef CONFIG_PIN_TLB
+ /* 8xx can only access 24MB at the moment */
+ memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01800000));
+#else
+ /* 8xx can only access 8MB at the moment */
+ memblock_set_current_limit(min_t(u64, first_memblock_size, 0x00800000));
#endif
-
-#ifdef CONFIG_PROC_KCORE
-static struct kcore_list kcore_vmem;
-
-static int __init setup_kcore(void)
-{
- int i;
-
- for (i = 0; i < lmb.memory.cnt; i++) {
- unsigned long base;
- unsigned long size;
- struct kcore_list *kcore_mem;
-
- base = lmb.memory.region[i].base;
- size = lmb.memory.region[i].size;
-
- kcore_mem = kmalloc(sizeof(struct kcore_list), GFP_ATOMIC);
- if (!kcore_mem)
- panic("%s: kmalloc failed\n", __FUNCTION__);
-
- /* must stay under 32 bits */
- if ( 0xfffffffful - (unsigned long)__va(base) < size) {
- size = 0xfffffffful - (unsigned long)(__va(base));
- printk(KERN_DEBUG "setup_kcore: restrict size=%lx\n",
- size);
- }
-
- kclist_add(kcore_mem, __va(base), size);
- }
-
- kclist_add(&kcore_vmem, (void *)VMALLOC_START,
- VMALLOC_END-VMALLOC_START);
-
- return 0;
}
-module_init(setup_kcore);
-#endif
+#endif /* CONFIG_8xx */
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index c0f5cff7703..e3734edffa6 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -19,6 +19,8 @@
*
*/
+#undef DEBUG
+
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
@@ -38,11 +40,13 @@
#include <linux/nodemask.h>
#include <linux/module.h>
#include <linux/poison.h>
+#include <linux/memblock.h>
+#include <linux/hugetlb.h>
+#include <linux/slab.h>
#include <asm/pgalloc.h>
#include <asm/page.h>
#include <asm/prom.h>
-#include <asm/lmb.h>
#include <asm/rtas.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
@@ -57,13 +61,12 @@
#include <asm/mmzone.h>
#include <asm/cputable.h>
#include <asm/sections.h>
-#include <asm/system.h>
#include <asm/iommu.h>
-#include <asm/abs_addr.h>
#include <asm/vdso.h>
#include "mmu_decl.h"
+#ifdef CONFIG_PPC_STD_MMU_64
#if PGTABLE_RANGE > USER_VSID_RANGE
#warning Limited user VSID range means pagetable space is wasted
#endif
@@ -71,109 +74,82 @@
#if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE)
#warning TASK_SIZE is smaller than it needs to be.
#endif
+#endif /* CONFIG_PPC_STD_MMU_64 */
-/* max amount of RAM to use */
-unsigned long __max_memory;
+phys_addr_t memstart_addr = ~0;
+EXPORT_SYMBOL_GPL(memstart_addr);
+phys_addr_t kernstart_addr;
+EXPORT_SYMBOL_GPL(kernstart_addr);
-void free_initmem(void)
+static void pgd_ctor(void *addr)
{
- unsigned long addr;
-
- addr = (unsigned long)__init_begin;
- for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) {
- memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
- ClearPageReserved(virt_to_page(addr));
- init_page_count(virt_to_page(addr));
- free_page(addr);
- totalram_pages++;
- }
- printk ("Freeing unused kernel memory: %luk freed\n",
- ((unsigned long)__init_end - (unsigned long)__init_begin) >> 10);
+ memset(addr, 0, PGD_TABLE_SIZE);
}
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
+static void pmd_ctor(void *addr)
{
- if (start < end)
- printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
- for (; start < end; start += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(start));
- init_page_count(virt_to_page(start));
- free_page(start);
- totalram_pages++;
- }
-}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ memset(addr, 0, PMD_TABLE_SIZE * 2);
+#else
+ memset(addr, 0, PMD_TABLE_SIZE);
#endif
-
-#ifdef CONFIG_PROC_KCORE
-static struct kcore_list kcore_vmem;
-
-static int __init setup_kcore(void)
-{
- int i;
-
- for (i=0; i < lmb.memory.cnt; i++) {
- unsigned long base, size;
- struct kcore_list *kcore_mem;
-
- base = lmb.memory.region[i].base;
- size = lmb.memory.region[i].size;
-
- /* GFP_ATOMIC to avoid might_sleep warnings during boot */
- kcore_mem = kmalloc(sizeof(struct kcore_list), GFP_ATOMIC);
- if (!kcore_mem)
- panic("%s: kmalloc failed\n", __FUNCTION__);
-
- kclist_add(kcore_mem, __va(base), size);
- }
-
- kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START);
-
- return 0;
}
-module_init(setup_kcore);
-#endif
-static void zero_ctor(struct kmem_cache *cache, void *addr)
+struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
+
+/*
+ * Create a kmem_cache() for pagetables. This is not used for PTE
+ * pages - they're linked to struct page, come from the normal free
+ * pages pool and have a different entry size (see real_pte_t) to
+ * everything else. Caches created by this function are used for all
+ * the higher level pagetables, and for hugepage pagetables.
+ */
+void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
{
- memset(addr, 0, kmem_cache_size(cache));
+ char *name;
+ unsigned long table_size = sizeof(void *) << shift;
+ unsigned long align = table_size;
+
+ /* When batching pgtable pointers for RCU freeing, we store
+ * the index size in the low bits. Table alignment must be
+ * big enough to fit it.
+ *
+ * Likewise, hugeapge pagetable pointers contain a (different)
+ * shift value in the low bits. All tables must be aligned so
+ * as to leave enough 0 bits in the address to contain it. */
+ unsigned long minalign = max(MAX_PGTABLE_INDEX_SIZE + 1,
+ HUGEPD_SHIFT_MASK + 1);
+ struct kmem_cache *new;
+
+ /* It would be nice if this was a BUILD_BUG_ON(), but at the
+ * moment, gcc doesn't seem to recognize is_power_of_2 as a
+ * constant expression, so so much for that. */
+ BUG_ON(!is_power_of_2(minalign));
+ BUG_ON((shift < 1) || (shift > MAX_PGTABLE_INDEX_SIZE));
+
+ if (PGT_CACHE(shift))
+ return; /* Already have a cache of this size */
+
+ align = max_t(unsigned long, align, minalign);
+ name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift);
+ new = kmem_cache_create(name, table_size, align, 0, ctor);
+ pgtable_cache[shift - 1] = new;
+ pr_debug("Allocated pgtable cache for order %d\n", shift);
}
-static const unsigned int pgtable_cache_size[2] = {
- PGD_TABLE_SIZE, PMD_TABLE_SIZE
-};
-static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
-#ifdef CONFIG_PPC_64K_PAGES
- "pgd_cache", "pmd_cache",
-#else
- "pgd_cache", "pud_pmd_cache",
-#endif /* CONFIG_PPC_64K_PAGES */
-};
-
-#ifdef CONFIG_HUGETLB_PAGE
-/* Hugepages need one extra cache, initialized in hugetlbpage.c. We
- * can't put into the tables above, because HPAGE_SHIFT is not compile
- * time constant. */
-struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+1];
-#else
-struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
-#endif
void pgtable_cache_init(void)
{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(pgtable_cache_size); i++) {
- int size = pgtable_cache_size[i];
- const char *name = pgtable_cache_name[i];
-
- pr_debug("Allocating page table cache %s (#%d) "
- "for size: %08x...\n", name, i, size);
- pgtable_cache[i] = kmem_cache_create(name,
- size, size,
- SLAB_PANIC,
- zero_ctor);
- }
+ pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
+ pgtable_cache_add(PMD_CACHE_INDEX, pmd_ctor);
+ if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_CACHE_INDEX))
+ panic("Couldn't allocate pgtable caches");
+ /* In all current configs, when the PUD index exists it's the
+ * same size as either the pgd or pmd index. Verify that the
+ * initialization above has also created a PUD cache. This
+ * will need re-examiniation if we add new possibilities for
+ * the pagetable layout. */
+ BUG_ON(PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE));
}
#ifdef CONFIG_SPARSEMEM_VMEMMAP
@@ -183,7 +159,7 @@ void pgtable_cache_init(void)
* do this by hand as the proffered address may not be correctly aligned.
* Subtraction of non-aligned pointers produces undefined results.
*/
-unsigned long __meminit vmemmap_section_start(unsigned long page)
+static unsigned long __meminit vmemmap_section_start(unsigned long page)
{
unsigned long offset = page - ((unsigned long)(vmemmap));
@@ -196,7 +172,7 @@ unsigned long __meminit vmemmap_section_start(unsigned long page)
* which overlaps this vmemmap page is initialised then this page is
* initialised already.
*/
-int __meminit vmemmap_populated(unsigned long start, int page_size)
+static int __meminit vmemmap_populated(unsigned long start, int page_size)
{
unsigned long end = start + page_size;
@@ -207,21 +183,99 @@ int __meminit vmemmap_populated(unsigned long start, int page_size)
return 0;
}
-int __meminit vmemmap_populate(struct page *start_page,
- unsigned long nr_pages, int node)
+/* On hash-based CPUs, the vmemmap is bolted in the hash table.
+ *
+ * On Book3E CPUs, the vmemmap is currently mapped in the top half of
+ * the vmalloc space using normal page tables, though the size of
+ * pages encoded in the PTEs can be different
+ */
+
+#ifdef CONFIG_PPC_BOOK3E
+static void __meminit vmemmap_create_mapping(unsigned long start,
+ unsigned long page_size,
+ unsigned long phys)
+{
+ /* Create a PTE encoding without page size */
+ unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED |
+ _PAGE_KERNEL_RW;
+
+ /* PTEs only contain page size encodings up to 32M */
+ BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf);
+
+ /* Encode the size in the PTE */
+ flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8;
+
+ /* For each PTE for that area, map things. Note that we don't
+ * increment phys because all PTEs are of the large size and
+ * thus must have the low bits clear
+ */
+ for (i = 0; i < page_size; i += PAGE_SIZE)
+ BUG_ON(map_kernel_page(start + i, phys, flags));
+}
+#else /* CONFIG_PPC_BOOK3E */
+static void __meminit vmemmap_create_mapping(unsigned long start,
+ unsigned long page_size,
+ unsigned long phys)
+{
+ int mapped = htab_bolt_mapping(start, start + page_size, phys,
+ pgprot_val(PAGE_KERNEL),
+ mmu_vmemmap_psize,
+ mmu_kernel_ssize);
+ BUG_ON(mapped < 0);
+}
+#endif /* CONFIG_PPC_BOOK3E */
+
+struct vmemmap_backing *vmemmap_list;
+
+static __meminit struct vmemmap_backing * vmemmap_list_alloc(int node)
{
- unsigned long mode_rw;
- unsigned long start = (unsigned long)start_page;
- unsigned long end = (unsigned long)(start_page + nr_pages);
- unsigned long page_size = 1 << mmu_psize_defs[mmu_linear_psize].shift;
+ static struct vmemmap_backing *next;
+ static int num_left;
+
+ /* allocate a page when required and hand out chunks */
+ if (!next || !num_left) {
+ next = vmemmap_alloc_block(PAGE_SIZE, node);
+ if (unlikely(!next)) {
+ WARN_ON(1);
+ return NULL;
+ }
+ num_left = PAGE_SIZE / sizeof(struct vmemmap_backing);
+ }
- mode_rw = _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_COHERENT | PP_RWXX;
+ num_left--;
+
+ return next++;
+}
+
+static __meminit void vmemmap_list_populate(unsigned long phys,
+ unsigned long start,
+ int node)
+{
+ struct vmemmap_backing *vmem_back;
+
+ vmem_back = vmemmap_list_alloc(node);
+ if (unlikely(!vmem_back)) {
+ WARN_ON(1);
+ return;
+ }
+
+ vmem_back->phys = phys;
+ vmem_back->virt_addr = start;
+ vmem_back->list = vmemmap_list;
+
+ vmemmap_list = vmem_back;
+}
+
+int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
+{
+ unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
/* Align to the page size of the linear mapping. */
start = _ALIGN_DOWN(start, page_size);
+ pr_debug("vmemmap_populate %lx..%lx, node %d\n", start, end, node);
+
for (; start < end; start += page_size) {
- int mapped;
void *p;
if (vmemmap_populated(start, page_size))
@@ -231,15 +285,73 @@ int __meminit vmemmap_populate(struct page *start_page,
if (!p)
return -ENOMEM;
- pr_debug("vmemmap %08lx allocated at %p, physical %08lx.\n",
- start, p, __pa(p));
+ vmemmap_list_populate(__pa(p), start, node);
- mapped = htab_bolt_mapping(start, start + page_size,
- __pa(p), mode_rw, mmu_linear_psize,
- mmu_kernel_ssize);
- BUG_ON(mapped < 0);
+ pr_debug(" * %016lx..%016lx allocated at %p\n",
+ start, start + page_size, p);
+
+ vmemmap_create_mapping(start, page_size, __pa(p));
}
return 0;
}
-#endif
+
+void vmemmap_free(unsigned long start, unsigned long end)
+{
+}
+
+void register_page_bootmem_memmap(unsigned long section_nr,
+ struct page *start_page, unsigned long size)
+{
+}
+
+/*
+ * We do not have access to the sparsemem vmemmap, so we fallback to
+ * walking the list of sparsemem blocks which we already maintain for
+ * the sake of crashdump. In the long run, we might want to maintain
+ * a tree if performance of that linear walk becomes a problem.
+ *
+ * realmode_pfn_to_page functions can fail due to:
+ * 1) As real sparsemem blocks do not lay in RAM continously (they
+ * are in virtual address space which is not available in the real mode),
+ * the requested page struct can be split between blocks so get_page/put_page
+ * may fail.
+ * 2) When huge pages are used, the get_page/put_page API will fail
+ * in real mode as the linked addresses in the page struct are virtual
+ * too.
+ */
+struct page *realmode_pfn_to_page(unsigned long pfn)
+{
+ struct vmemmap_backing *vmem_back;
+ struct page *page;
+ unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
+ unsigned long pg_va = (unsigned long) pfn_to_page(pfn);
+
+ for (vmem_back = vmemmap_list; vmem_back; vmem_back = vmem_back->list) {
+ if (pg_va < vmem_back->virt_addr)
+ continue;
+
+ /* Check that page struct is not split between real pages */
+ if ((pg_va + sizeof(struct page)) >
+ (vmem_back->virt_addr + page_size))
+ return NULL;
+
+ page = (struct page *) (vmem_back->phys + pg_va -
+ vmem_back->virt_addr);
+ return page;
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
+
+#elif defined(CONFIG_FLATMEM)
+
+struct page *realmode_pfn_to_page(unsigned long pfn)
+{
+ struct page *page = pfn_to_page(pfn);
+ return page;
+}
+EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
+
+#endif /* CONFIG_SPARSEMEM_VMEMMAP/CONFIG_FLATMEM */
diff --git a/arch/powerpc/mm/lmb.c b/arch/powerpc/mm/lmb.c
deleted file mode 100644
index 4ce23bcf8a5..00000000000
--- a/arch/powerpc/mm/lmb.c
+++ /dev/null
@@ -1,357 +0,0 @@
-/*
- * Procedures for maintaining information about logical memory blocks.
- *
- * Peter Bergner, IBM Corp. June 2001.
- * Copyright (C) 2001 Peter Bergner.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/bitops.h>
-#include <asm/types.h>
-#include <asm/page.h>
-#include <asm/prom.h>
-#include <asm/lmb.h>
-#ifdef CONFIG_PPC32
-#include "mmu_decl.h" /* for __max_low_memory */
-#endif
-
-#undef DEBUG
-
-#ifdef DEBUG
-#include <asm/udbg.h>
-#define DBG(fmt...) udbg_printf(fmt)
-#else
-#define DBG(fmt...)
-#endif
-
-#define LMB_ALLOC_ANYWHERE 0
-
-struct lmb lmb;
-
-void lmb_dump_all(void)
-{
-#ifdef DEBUG
- unsigned long i;
-
- DBG("lmb_dump_all:\n");
- DBG(" memory.cnt = 0x%lx\n", lmb.memory.cnt);
- DBG(" memory.size = 0x%lx\n", lmb.memory.size);
- for (i=0; i < lmb.memory.cnt ;i++) {
- DBG(" memory.region[0x%x].base = 0x%lx\n",
- i, lmb.memory.region[i].base);
- DBG(" .size = 0x%lx\n",
- lmb.memory.region[i].size);
- }
-
- DBG("\n reserved.cnt = 0x%lx\n", lmb.reserved.cnt);
- DBG(" reserved.size = 0x%lx\n", lmb.reserved.size);
- for (i=0; i < lmb.reserved.cnt ;i++) {
- DBG(" reserved.region[0x%x].base = 0x%lx\n",
- i, lmb.reserved.region[i].base);
- DBG(" .size = 0x%lx\n",
- lmb.reserved.region[i].size);
- }
-#endif /* DEBUG */
-}
-
-static unsigned long __init lmb_addrs_overlap(unsigned long base1,
- unsigned long size1, unsigned long base2, unsigned long size2)
-{
- return ((base1 < (base2+size2)) && (base2 < (base1+size1)));
-}
-
-static long __init lmb_addrs_adjacent(unsigned long base1, unsigned long size1,
- unsigned long base2, unsigned long size2)
-{
- if (base2 == base1 + size1)
- return 1;
- else if (base1 == base2 + size2)
- return -1;
-
- return 0;
-}
-
-static long __init lmb_regions_adjacent(struct lmb_region *rgn,
- unsigned long r1, unsigned long r2)
-{
- unsigned long base1 = rgn->region[r1].base;
- unsigned long size1 = rgn->region[r1].size;
- unsigned long base2 = rgn->region[r2].base;
- unsigned long size2 = rgn->region[r2].size;
-
- return lmb_addrs_adjacent(base1, size1, base2, size2);
-}
-
-static void __init lmb_remove_region(struct lmb_region *rgn, unsigned long r)
-{
- unsigned long i;
-
- for (i = r; i < rgn->cnt - 1; i++) {
- rgn->region[i].base = rgn->region[i + 1].base;
- rgn->region[i].size = rgn->region[i + 1].size;
- }
- rgn->cnt--;
-}
-
-/* Assumption: base addr of region 1 < base addr of region 2 */
-static void __init lmb_coalesce_regions(struct lmb_region *rgn,
- unsigned long r1, unsigned long r2)
-{
- rgn->region[r1].size += rgn->region[r2].size;
- lmb_remove_region(rgn, r2);
-}
-
-/* This routine called with relocation disabled. */
-void __init lmb_init(void)
-{
- /* Create a dummy zero size LMB which will get coalesced away later.
- * This simplifies the lmb_add() code below...
- */
- lmb.memory.region[0].base = 0;
- lmb.memory.region[0].size = 0;
- lmb.memory.cnt = 1;
-
- /* Ditto. */
- lmb.reserved.region[0].base = 0;
- lmb.reserved.region[0].size = 0;
- lmb.reserved.cnt = 1;
-}
-
-/* This routine may be called with relocation disabled. */
-void __init lmb_analyze(void)
-{
- int i;
-
- lmb.memory.size = 0;
-
- for (i = 0; i < lmb.memory.cnt; i++)
- lmb.memory.size += lmb.memory.region[i].size;
-}
-
-/* This routine called with relocation disabled. */
-static long __init lmb_add_region(struct lmb_region *rgn, unsigned long base,
- unsigned long size)
-{
- unsigned long coalesced = 0;
- long adjacent, i;
-
- /* First try and coalesce this LMB with another. */
- for (i=0; i < rgn->cnt; i++) {
- unsigned long rgnbase = rgn->region[i].base;
- unsigned long rgnsize = rgn->region[i].size;
-
- if ((rgnbase == base) && (rgnsize == size))
- /* Already have this region, so we're done */
- return 0;
-
- adjacent = lmb_addrs_adjacent(base,size,rgnbase,rgnsize);
- if ( adjacent > 0 ) {
- rgn->region[i].base -= size;
- rgn->region[i].size += size;
- coalesced++;
- break;
- }
- else if ( adjacent < 0 ) {
- rgn->region[i].size += size;
- coalesced++;
- break;
- }
- }
-
- if ((i < rgn->cnt-1) && lmb_regions_adjacent(rgn, i, i+1) ) {
- lmb_coalesce_regions(rgn, i, i+1);
- coalesced++;
- }
-
- if (coalesced)
- return coalesced;
- if (rgn->cnt >= MAX_LMB_REGIONS)
- return -1;
-
- /* Couldn't coalesce the LMB, so add it to the sorted table. */
- for (i = rgn->cnt-1; i >= 0; i--) {
- if (base < rgn->region[i].base) {
- rgn->region[i+1].base = rgn->region[i].base;
- rgn->region[i+1].size = rgn->region[i].size;
- } else {
- rgn->region[i+1].base = base;
- rgn->region[i+1].size = size;
- break;
- }
- }
- rgn->cnt++;
-
- return 0;
-}
-
-/* This routine may be called with relocation disabled. */
-long __init lmb_add(unsigned long base, unsigned long size)
-{
- struct lmb_region *_rgn = &(lmb.memory);
-
- /* On pSeries LPAR systems, the first LMB is our RMO region. */
- if (base == 0)
- lmb.rmo_size = size;
-
- return lmb_add_region(_rgn, base, size);
-
-}
-
-long __init lmb_reserve(unsigned long base, unsigned long size)
-{
- struct lmb_region *_rgn = &(lmb.reserved);
-
- BUG_ON(0 == size);
-
- return lmb_add_region(_rgn, base, size);
-}
-
-long __init lmb_overlaps_region(struct lmb_region *rgn, unsigned long base,
- unsigned long size)
-{
- unsigned long i;
-
- for (i=0; i < rgn->cnt; i++) {
- unsigned long rgnbase = rgn->region[i].base;
- unsigned long rgnsize = rgn->region[i].size;
- if ( lmb_addrs_overlap(base,size,rgnbase,rgnsize) ) {
- break;
- }
- }
-
- return (i < rgn->cnt) ? i : -1;
-}
-
-unsigned long __init lmb_alloc(unsigned long size, unsigned long align)
-{
- return lmb_alloc_base(size, align, LMB_ALLOC_ANYWHERE);
-}
-
-unsigned long __init lmb_alloc_base(unsigned long size, unsigned long align,
- unsigned long max_addr)
-{
- unsigned long alloc;
-
- alloc = __lmb_alloc_base(size, align, max_addr);
-
- if (alloc == 0)
- panic("ERROR: Failed to allocate 0x%lx bytes below 0x%lx.\n",
- size, max_addr);
-
- return alloc;
-}
-
-unsigned long __init __lmb_alloc_base(unsigned long size, unsigned long align,
- unsigned long max_addr)
-{
- long i, j;
- unsigned long base = 0;
-
- BUG_ON(0 == size);
-
-#ifdef CONFIG_PPC32
- /* On 32-bit, make sure we allocate lowmem */
- if (max_addr == LMB_ALLOC_ANYWHERE)
- max_addr = __max_low_memory;
-#endif
- for (i = lmb.memory.cnt-1; i >= 0; i--) {
- unsigned long lmbbase = lmb.memory.region[i].base;
- unsigned long lmbsize = lmb.memory.region[i].size;
-
- if (max_addr == LMB_ALLOC_ANYWHERE)
- base = _ALIGN_DOWN(lmbbase + lmbsize - size, align);
- else if (lmbbase < max_addr) {
- base = min(lmbbase + lmbsize, max_addr);
- base = _ALIGN_DOWN(base - size, align);
- } else
- continue;
-
- while ((lmbbase <= base) &&
- ((j = lmb_overlaps_region(&lmb.reserved, base, size)) >= 0) )
- base = _ALIGN_DOWN(lmb.reserved.region[j].base - size,
- align);
-
- if ((base != 0) && (lmbbase <= base))
- break;
- }
-
- if (i < 0)
- return 0;
-
- lmb_add_region(&lmb.reserved, base, size);
-
- return base;
-}
-
-/* You must call lmb_analyze() before this. */
-unsigned long __init lmb_phys_mem_size(void)
-{
- return lmb.memory.size;
-}
-
-unsigned long __init lmb_end_of_DRAM(void)
-{
- int idx = lmb.memory.cnt - 1;
-
- return (lmb.memory.region[idx].base + lmb.memory.region[idx].size);
-}
-
-/* You must call lmb_analyze() after this. */
-void __init lmb_enforce_memory_limit(unsigned long memory_limit)
-{
- unsigned long i, limit;
- struct lmb_property *p;
-
- if (! memory_limit)
- return;
-
- /* Truncate the lmb regions to satisfy the memory limit. */
- limit = memory_limit;
- for (i = 0; i < lmb.memory.cnt; i++) {
- if (limit > lmb.memory.region[i].size) {
- limit -= lmb.memory.region[i].size;
- continue;
- }
-
- lmb.memory.region[i].size = limit;
- lmb.memory.cnt = i + 1;
- break;
- }
-
- if (lmb.memory.region[0].size < lmb.rmo_size)
- lmb.rmo_size = lmb.memory.region[0].size;
-
- /* And truncate any reserves above the limit also. */
- for (i = 0; i < lmb.reserved.cnt; i++) {
- p = &lmb.reserved.region[i];
-
- if (p->base > memory_limit)
- p->size = 0;
- else if ((p->base + p->size) > memory_limit)
- p->size = memory_limit - p->base;
-
- if (p->size == 0) {
- lmb_remove_region(&lmb.reserved, i);
- i--;
- }
- }
-}
-
-int __init lmb_is_reserved(unsigned long addr)
-{
- int i;
-
- for (i = 0; i < lmb.reserved.cnt; i++) {
- unsigned long upper = lmb.reserved.region[i].base +
- lmb.reserved.region[i].size - 1;
- if ((addr >= lmb.reserved.region[i].base) && (addr <= upper))
- return 1;
- }
- return 0;
-}
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index be5c506779a..2c8e90f5789 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -17,11 +17,12 @@
*
*/
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
+#include <linux/gfp.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/stddef.h>
@@ -31,6 +32,9 @@
#include <linux/initrd.h>
#include <linux/pagemap.h>
#include <linux/suspend.h>
+#include <linux/memblock.h>
+#include <linux/hugetlb.h>
+#include <linux/slab.h>
#include <asm/pgalloc.h>
#include <asm/prom.h>
@@ -42,9 +46,12 @@
#include <asm/machdep.h>
#include <asm/btext.h>
#include <asm/tlb.h>
-#include <asm/lmb.h>
#include <asm/sections.h>
+#include <asm/sparsemem.h>
#include <asm/vdso.h>
+#include <asm/fixmap.h>
+#include <asm/swiotlb.h>
+#include <asm/rtas.h>
#include "mmu_decl.h"
@@ -55,27 +62,32 @@
int init_bootmem_done;
int mem_init_done;
-unsigned long memory_limit;
+unsigned long long memory_limit;
-int page_is_ram(unsigned long pfn)
+#ifdef CONFIG_HIGHMEM
+pte_t *kmap_pte;
+EXPORT_SYMBOL(kmap_pte);
+pgprot_t kmap_prot;
+EXPORT_SYMBOL(kmap_prot);
+
+static inline pte_t *virt_to_kpte(unsigned long vaddr)
{
- unsigned long paddr = (pfn << PAGE_SHIFT);
+ return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
+ vaddr), vaddr), vaddr);
+}
+#endif
+int page_is_ram(unsigned long pfn)
+{
#ifndef CONFIG_PPC64 /* XXX for now */
- return paddr < __pa(high_memory);
+ return pfn < max_pfn;
#else
- int i;
- for (i=0; i < lmb.memory.cnt; i++) {
- unsigned long base;
-
- base = lmb.memory.region[i].base;
+ unsigned long paddr = (pfn << PAGE_SHIFT);
+ struct memblock_region *reg;
- if ((paddr >= base) &&
- (paddr < (base + lmb.memory.region[i].size))) {
+ for_each_memblock(memory, reg)
+ if (paddr >= reg->base && paddr < (reg->base + reg->size))
return 1;
- }
- }
-
return 0;
#endif
}
@@ -87,23 +99,14 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
return ppc_md.phys_mem_access_prot(file, pfn, size, vma_prot);
if (!page_is_ram(pfn))
- vma_prot = __pgprot(pgprot_val(vma_prot)
- | _PAGE_GUARDED | _PAGE_NO_CACHE);
+ vma_prot = pgprot_noncached(vma_prot);
+
return vma_prot;
}
EXPORT_SYMBOL(phys_mem_access_prot);
#ifdef CONFIG_MEMORY_HOTPLUG
-void online_page(struct page *page)
-{
- ClearPageReserved(page);
- init_page_count(page);
- __free_page(page);
- totalram_pages++;
- num_physpages++;
-}
-
#ifdef CONFIG_NUMA
int memory_add_physaddr_to_nid(u64 start)
{
@@ -111,7 +114,7 @@ int memory_add_physaddr_to_nid(u64 start)
}
#endif
-int __devinit arch_add_memory(int nid, u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size)
{
struct pglist_data *pgdata;
struct zone *zone;
@@ -121,88 +124,60 @@ int __devinit arch_add_memory(int nid, u64 start, u64 size)
pgdata = NODE_DATA(nid);
start = (unsigned long)__va(start);
- create_section_mapping(start, start + size);
+ if (create_section_mapping(start, start + size))
+ return -EINVAL;
/* this should work for most non-highmem platforms */
zone = pgdata->node_zones;
- return __add_pages(zone, start_pfn, nr_pages);
+ return __add_pages(nid, zone, start_pfn, nr_pages);
}
#ifdef CONFIG_MEMORY_HOTREMOVE
-int remove_memory(u64 start, u64 size)
+int arch_remove_memory(u64 start, u64 size)
{
- unsigned long start_pfn, end_pfn;
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+ struct zone *zone;
int ret;
- start_pfn = start >> PAGE_SHIFT;
- end_pfn = start_pfn + (size >> PAGE_SHIFT);
- ret = offline_pages(start_pfn, end_pfn, 120 * HZ);
- if (ret)
- goto out;
- /* Arch-specific calls go here - next patch */
-out:
+ zone = page_zone(pfn_to_page(start_pfn));
+ ret = __remove_pages(zone, start_pfn, nr_pages);
+ if (!ret && (ppc_md.remove_memory))
+ ret = ppc_md.remove_memory(start, size);
+
return ret;
}
-#endif /* CONFIG_MEMORY_HOTREMOVE */
+#endif
+#endif /* CONFIG_MEMORY_HOTPLUG */
/*
* walk_memory_resource() needs to make sure there is no holes in a given
- * memory range. On PPC64, since this range comes from /sysfs, the range
- * is guaranteed to be valid, non-overlapping and can not contain any
- * holes. By the time we get here (memory add or remove), /proc/device-tree
- * is updated and correct. Only reason we need to check against device-tree
- * would be if we allow user-land to specify a memory range through a
- * system call/ioctl etc. instead of doing offline/online through /sysfs.
+ * memory range. PPC64 does not maintain the memory layout in /proc/iomem.
+ * Instead it maintains it in memblock.memory structures. Walk through the
+ * memory regions, find holes and callback for contiguous regions.
*/
int
-walk_memory_resource(unsigned long start_pfn, unsigned long nr_pages, void *arg,
- int (*func)(unsigned long, unsigned long, void *))
+walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
+ void *arg, int (*func)(unsigned long, unsigned long, void *))
{
- return (*func)(start_pfn, nr_pages, arg);
-}
-
-#endif /* CONFIG_MEMORY_HOTPLUG */
-
-void show_mem(void)
-{
- unsigned long total = 0, reserved = 0;
- unsigned long shared = 0, cached = 0;
- unsigned long highmem = 0;
- struct page *page;
- pg_data_t *pgdat;
- unsigned long i;
-
- printk("Mem-info:\n");
- show_free_areas();
- printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
- for_each_online_pgdat(pgdat) {
- unsigned long flags;
- pgdat_resize_lock(pgdat, &flags);
- for (i = 0; i < pgdat->node_spanned_pages; i++) {
- if (!pfn_valid(pgdat->node_start_pfn + i))
- continue;
- page = pgdat_page_nr(pgdat, i);
- total++;
- if (PageHighMem(page))
- highmem++;
- if (PageReserved(page))
- reserved++;
- else if (PageSwapCache(page))
- cached++;
- else if (page_count(page))
- shared += page_count(page) - 1;
- }
- pgdat_resize_unlock(pgdat, &flags);
+ struct memblock_region *reg;
+ unsigned long end_pfn = start_pfn + nr_pages;
+ unsigned long tstart, tend;
+ int ret = -1;
+
+ for_each_memblock(memory, reg) {
+ tstart = max(start_pfn, memblock_region_memory_base_pfn(reg));
+ tend = min(end_pfn, memblock_region_memory_end_pfn(reg));
+ if (tstart >= tend)
+ continue;
+ ret = (*func)(tstart, tend - tstart, arg);
+ if (ret)
+ break;
}
- printk("%ld pages of RAM\n", total);
-#ifdef CONFIG_HIGHMEM
- printk("%ld pages of HIGHMEM\n", highmem);
-#endif
- printk("%ld reserved pages\n", reserved);
- printk("%ld pages shared\n", shared);
- printk("%ld pages swap cached\n", cached);
+ return ret;
}
+EXPORT_SYMBOL_GPL(walk_system_ram_range);
/*
* Initialize the bootmem system and give it all the memory we
@@ -212,14 +187,16 @@ void show_mem(void)
#ifndef CONFIG_NEED_MULTIPLE_NODES
void __init do_init_bootmem(void)
{
- unsigned long i;
unsigned long start, bootmap_pages;
unsigned long total_pages;
+ struct memblock_region *reg;
int boot_mapsize;
- max_pfn = total_pages = lmb_end_of_DRAM() >> PAGE_SHIFT;
+ max_low_pfn = max_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
+ total_pages = (memblock_end_of_DRAM() - memstart_addr) >> PAGE_SHIFT;
#ifdef CONFIG_HIGHMEM
total_pages = total_lowmem >> PAGE_SHIFT;
+ max_low_pfn = lowmem_end_addr >> PAGE_SHIFT;
#endif
/*
@@ -229,48 +206,38 @@ void __init do_init_bootmem(void)
*/
bootmap_pages = bootmem_bootmap_pages(total_pages);
- start = lmb_alloc(bootmap_pages << PAGE_SHIFT, PAGE_SIZE);
+ start = memblock_alloc(bootmap_pages << PAGE_SHIFT, PAGE_SIZE);
- boot_mapsize = init_bootmem(start >> PAGE_SHIFT, total_pages);
+ min_low_pfn = MEMORY_START >> PAGE_SHIFT;
+ boot_mapsize = init_bootmem_node(NODE_DATA(0), start >> PAGE_SHIFT, min_low_pfn, max_low_pfn);
- /* Add active regions with valid PFNs */
- for (i = 0; i < lmb.memory.cnt; i++) {
- unsigned long start_pfn, end_pfn;
- start_pfn = lmb.memory.region[i].base >> PAGE_SHIFT;
- end_pfn = start_pfn + lmb_size_pages(&lmb.memory, i);
- add_active_range(0, start_pfn, end_pfn);
- }
+ /* Place all memblock_regions in the same node and merge contiguous
+ * memblock_regions
+ */
+ memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
/* Add all physical memory to the bootmem map, mark each area
* present.
*/
#ifdef CONFIG_HIGHMEM
- free_bootmem_with_active_regions(0, total_lowmem >> PAGE_SHIFT);
+ free_bootmem_with_active_regions(0, lowmem_end_addr >> PAGE_SHIFT);
/* reserve the sections we're already using */
- for (i = 0; i < lmb.reserved.cnt; i++) {
- unsigned long addr = lmb.reserved.region[i].base +
- lmb_size_bytes(&lmb.reserved, i) - 1;
- if (addr < total_lowmem)
- reserve_bootmem(lmb.reserved.region[i].base,
- lmb_size_bytes(&lmb.reserved, i),
- BOOTMEM_DEFAULT);
- else if (lmb.reserved.region[i].base < total_lowmem) {
- unsigned long adjusted_size = total_lowmem -
- lmb.reserved.region[i].base;
- reserve_bootmem(lmb.reserved.region[i].base,
- adjusted_size, BOOTMEM_DEFAULT);
+ for_each_memblock(reserved, reg) {
+ unsigned long top = reg->base + reg->size - 1;
+ if (top < lowmem_end_addr)
+ reserve_bootmem(reg->base, reg->size, BOOTMEM_DEFAULT);
+ else if (reg->base < lowmem_end_addr) {
+ unsigned long trunc_size = lowmem_end_addr - reg->base;
+ reserve_bootmem(reg->base, trunc_size, BOOTMEM_DEFAULT);
}
}
#else
free_bootmem_with_active_regions(0, max_pfn);
/* reserve the sections we're already using */
- for (i = 0; i < lmb.reserved.cnt; i++)
- reserve_bootmem(lmb.reserved.region[i].base,
- lmb_size_bytes(&lmb.reserved, i),
- BOOTMEM_DEFAULT);
-
+ for_each_memblock(reserved, reg)
+ reserve_bootmem(reg->base, reg->size, BOOTMEM_DEFAULT);
#endif
/* XXX need to clip this if using highmem? */
sparse_memory_present_with_active_regions(0);
@@ -281,22 +248,15 @@ void __init do_init_bootmem(void)
/* mark pages that don't exist as nosave */
static int __init mark_nonram_nosave(void)
{
- unsigned long lmb_next_region_start_pfn,
- lmb_region_max_pfn;
- int i;
-
- for (i = 0; i < lmb.memory.cnt - 1; i++) {
- lmb_region_max_pfn =
- (lmb.memory.region[i].base >> PAGE_SHIFT) +
- (lmb.memory.region[i].size >> PAGE_SHIFT);
- lmb_next_region_start_pfn =
- lmb.memory.region[i+1].base >> PAGE_SHIFT;
-
- if (lmb_region_max_pfn < lmb_next_region_start_pfn)
- register_nosave_region(lmb_region_max_pfn,
- lmb_next_region_start_pfn);
+ struct memblock_region *reg, *prev = NULL;
+
+ for_each_memblock(memory, reg) {
+ if (prev &&
+ memblock_region_memory_end_pfn(prev) < memblock_region_memory_base_pfn(reg))
+ register_nosave_region(memblock_region_memory_end_pfn(prev),
+ memblock_region_memory_base_pfn(reg));
+ prev = reg;
}
-
return 0;
}
@@ -305,28 +265,33 @@ static int __init mark_nonram_nosave(void)
*/
void __init paging_init(void)
{
- unsigned long total_ram = lmb_phys_mem_size();
- unsigned long top_of_ram = lmb_end_of_DRAM();
+ unsigned long long total_ram = memblock_phys_mem_size();
+ phys_addr_t top_of_ram = memblock_end_of_DRAM();
unsigned long max_zone_pfns[MAX_NR_ZONES];
+#ifdef CONFIG_PPC32
+ unsigned long v = __fix_to_virt(__end_of_fixed_addresses - 1);
+ unsigned long end = __fix_to_virt(FIX_HOLE);
+
+ for (; v < end; v += PAGE_SIZE)
+ map_page(v, 0, 0); /* XXX gross */
+#endif
+
#ifdef CONFIG_HIGHMEM
map_page(PKMAP_BASE, 0, 0); /* XXX gross */
- pkmap_page_table = pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k
- (PKMAP_BASE), PKMAP_BASE), PKMAP_BASE), PKMAP_BASE);
- map_page(KMAP_FIX_BEGIN, 0, 0); /* XXX gross */
- kmap_pte = pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k
- (KMAP_FIX_BEGIN), KMAP_FIX_BEGIN), KMAP_FIX_BEGIN),
- KMAP_FIX_BEGIN);
+ pkmap_page_table = virt_to_kpte(PKMAP_BASE);
+
+ kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN));
kmap_prot = PAGE_KERNEL;
#endif /* CONFIG_HIGHMEM */
- printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
- top_of_ram, total_ram);
+ printk(KERN_DEBUG "Top of RAM: 0x%llx, Total RAM: 0x%llx\n",
+ (unsigned long long)top_of_ram, total_ram);
printk(KERN_DEBUG "Memory hole size: %ldMB\n",
- (top_of_ram - total_ram) >> 20);
+ (long int)((top_of_ram - total_ram) >> 20));
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
#ifdef CONFIG_HIGHMEM
- max_zone_pfns[ZONE_DMA] = total_lowmem >> PAGE_SHIFT;
+ max_zone_pfns[ZONE_DMA] = lowmem_end_addr >> PAGE_SHIFT;
max_zone_pfns[ZONE_HIGHMEM] = top_of_ram >> PAGE_SHIFT;
#else
max_zone_pfns[ZONE_DMA] = top_of_ram >> PAGE_SHIFT;
@@ -337,80 +302,88 @@ void __init paging_init(void)
}
#endif /* ! CONFIG_NEED_MULTIPLE_NODES */
-void __init mem_init(void)
+static void __init register_page_bootmem_info(void)
{
-#ifdef CONFIG_NEED_MULTIPLE_NODES
- int nid;
-#endif
- pg_data_t *pgdat;
- unsigned long i;
- struct page *page;
- unsigned long reservedpages = 0, codesize, initsize, datasize, bsssize;
+ int i;
- num_physpages = lmb.memory.size >> PAGE_SHIFT;
- high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
+ for_each_online_node(i)
+ register_page_bootmem_info_node(NODE_DATA(i));
+}
-#ifdef CONFIG_NEED_MULTIPLE_NODES
- for_each_online_node(nid) {
- if (NODE_DATA(nid)->node_spanned_pages != 0) {
- printk("freeing bootmem node %d\n", nid);
- totalram_pages +=
- free_all_bootmem_node(NODE_DATA(nid));
- }
- }
-#else
- max_mapnr = max_pfn;
- totalram_pages += free_all_bootmem();
+void __init mem_init(void)
+{
+ /*
+ * book3s is limited to 16 page sizes due to encoding this in
+ * a 4-bit field for slices.
+ */
+ BUILD_BUG_ON(MMU_PAGE_COUNT > 16);
+
+#ifdef CONFIG_SWIOTLB
+ swiotlb_init(0);
#endif
- for_each_online_pgdat(pgdat) {
- for (i = 0; i < pgdat->node_spanned_pages; i++) {
- if (!pfn_valid(pgdat->node_start_pfn + i))
- continue;
- page = pgdat_page_nr(pgdat, i);
- if (PageReserved(page))
- reservedpages++;
- }
- }
- codesize = (unsigned long)&_sdata - (unsigned long)&_stext;
- datasize = (unsigned long)&_edata - (unsigned long)&_sdata;
- initsize = (unsigned long)&__init_end - (unsigned long)&__init_begin;
- bsssize = (unsigned long)&__bss_stop - (unsigned long)&__bss_start;
+ register_page_bootmem_info();
+ high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
+ set_max_mapnr(max_pfn);
+ free_all_bootmem();
#ifdef CONFIG_HIGHMEM
{
unsigned long pfn, highmem_mapnr;
- highmem_mapnr = total_lowmem >> PAGE_SHIFT;
+ highmem_mapnr = lowmem_end_addr >> PAGE_SHIFT;
for (pfn = highmem_mapnr; pfn < max_mapnr; ++pfn) {
+ phys_addr_t paddr = (phys_addr_t)pfn << PAGE_SHIFT;
struct page *page = pfn_to_page(pfn);
- if (lmb_is_reserved(pfn << PAGE_SHIFT))
- continue;
- ClearPageReserved(page);
- init_page_count(page);
- __free_page(page);
- totalhigh_pages++;
- reservedpages--;
+ if (!memblock_is_reserved(paddr))
+ free_highmem_page(page);
}
- totalram_pages += totalhigh_pages;
- printk(KERN_DEBUG "High memory: %luk\n",
- totalhigh_pages << (PAGE_SHIFT-10));
}
#endif /* CONFIG_HIGHMEM */
- printk(KERN_INFO "Memory: %luk/%luk available (%luk kernel code, "
- "%luk reserved, %luk data, %luk bss, %luk init)\n",
- (unsigned long)nr_free_pages() << (PAGE_SHIFT-10),
- num_physpages << (PAGE_SHIFT-10),
- codesize >> 10,
- reservedpages << (PAGE_SHIFT-10),
- datasize >> 10,
- bsssize >> 10,
- initsize >> 10);
+#if defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_SMP)
+ /*
+ * If smp is enabled, next_tlbcam_idx is initialized in the cpu up
+ * functions.... do it here for the non-smp case.
+ */
+ per_cpu(next_tlbcam_idx, smp_processor_id()) =
+ (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) - 1;
+#endif
+
+ mem_init_print_info(NULL);
+#ifdef CONFIG_PPC32
+ pr_info("Kernel virtual memory layout:\n");
+ pr_info(" * 0x%08lx..0x%08lx : fixmap\n", FIXADDR_START, FIXADDR_TOP);
+#ifdef CONFIG_HIGHMEM
+ pr_info(" * 0x%08lx..0x%08lx : highmem PTEs\n",
+ PKMAP_BASE, PKMAP_ADDR(LAST_PKMAP));
+#endif /* CONFIG_HIGHMEM */
+#ifdef CONFIG_NOT_COHERENT_CACHE
+ pr_info(" * 0x%08lx..0x%08lx : consistent mem\n",
+ IOREMAP_TOP, IOREMAP_TOP + CONFIG_CONSISTENT_SIZE);
+#endif /* CONFIG_NOT_COHERENT_CACHE */
+ pr_info(" * 0x%08lx..0x%08lx : early ioremap\n",
+ ioremap_bot, IOREMAP_TOP);
+ pr_info(" * 0x%08lx..0x%08lx : vmalloc & ioremap\n",
+ VMALLOC_START, VMALLOC_END);
+#endif /* CONFIG_PPC32 */
mem_init_done = 1;
}
+void free_initmem(void)
+{
+ ppc_md.progress = ppc_printk_progress;
+ free_initmem_default(POISON_FREE_INITMEM);
+}
+
+#ifdef CONFIG_BLK_DEV_INITRD
+void __init free_initrd_mem(unsigned long start, unsigned long end)
+{
+ free_reserved_area((void *)start, (void *)end, -1, "initrd");
+}
+#endif
+
/*
* This is called when a page has been modified by the kernel.
* It just marks the page as not i-cache clean. We do the i-cache
@@ -428,24 +401,33 @@ EXPORT_SYMBOL(flush_dcache_page);
void flush_dcache_icache_page(struct page *page)
{
+#ifdef CONFIG_HUGETLB_PAGE
+ if (PageCompound(page)) {
+ flush_dcache_icache_hugepage(page);
+ return;
+ }
+#endif
#ifdef CONFIG_BOOKE
- void *start = kmap_atomic(page, KM_PPC_SYNC_ICACHE);
- __flush_dcache_icache(start);
- kunmap_atomic(start, KM_PPC_SYNC_ICACHE);
+ {
+ void *start = kmap_atomic(page);
+ __flush_dcache_icache(start);
+ kunmap_atomic(start);
+ }
#elif defined(CONFIG_8xx) || defined(CONFIG_PPC64)
/* On 8xx there is no need to kmap since highmem is not supported */
__flush_dcache_icache(page_address(page));
#else
__flush_dcache_icache_phys(page_to_pfn(page) << PAGE_SHIFT);
#endif
-
}
+EXPORT_SYMBOL(flush_dcache_icache_page);
+
void clear_user_page(void *page, unsigned long vaddr, struct page *pg)
{
clear_page(page);
/*
- * We shouldnt have to do this, but some versions of glibc
+ * We shouldn't have to do this, but some versions of glibc
* require it (ld.so assumes zero filled pages are icache clean)
* - Anton
*/
@@ -496,46 +478,17 @@ EXPORT_SYMBOL(flush_icache_user_range);
* This must always be called with the pte lock held.
*/
void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
- pte_t pte)
+ pte_t *ptep)
{
#ifdef CONFIG_PPC_STD_MMU
+ /*
+ * We don't need to worry about _PAGE_PRESENT here because we are
+ * called with either mm->page_table_lock held or ptl lock held
+ */
unsigned long access = 0, trap;
-#endif
- unsigned long pfn = pte_pfn(pte);
-
- /* handle i-cache coherency */
- if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE) &&
- !cpu_has_feature(CPU_FTR_NOEXECUTE) &&
- pfn_valid(pfn)) {
- struct page *page = pfn_to_page(pfn);
-#ifdef CONFIG_8xx
- /* On 8xx, cache control instructions (particularly
- * "dcbst" from flush_dcache_icache) fault as write
- * operation if there is an unpopulated TLB entry
- * for the address in question. To workaround that,
- * we invalidate the TLB here, thus avoiding dcbst
- * misbehaviour.
- */
- _tlbie(address, 0 /* 8xx doesn't care about PID */);
-#endif
- /* The _PAGE_USER test should really be _PAGE_EXEC, but
- * older glibc versions execute some code from no-exec
- * pages, which for now we are supporting. If exec-only
- * pages are ever implemented, this will have to change.
- */
- if (!PageReserved(page) && (pte_val(pte) & _PAGE_USER)
- && !test_bit(PG_arch_1, &page->flags)) {
- if (vma->vm_mm == current->active_mm) {
- __flush_dcache_icache((void *) address);
- } else
- flush_dcache_icache_page(page);
- set_bit(PG_arch_1, &page->flags);
- }
- }
-#ifdef CONFIG_PPC_STD_MMU
/* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
- if (!pte_young(pte) || address >= TASK_SIZE)
+ if (!pte_young(*ptep) || address >= TASK_SIZE)
return;
/* We try to figure out if we are coming from an instruction
@@ -554,4 +507,58 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
return;
hash_preload(vma->vm_mm, address, access, trap);
#endif /* CONFIG_PPC_STD_MMU */
+#if (defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_FSL_BOOK3E)) \
+ && defined(CONFIG_HUGETLB_PAGE)
+ if (is_vm_hugetlb_page(vma))
+ book3e_hugetlb_preload(vma, address, *ptep);
+#endif
+}
+
+/*
+ * System memory should not be in /proc/iomem but various tools expect it
+ * (eg kdump).
+ */
+static int __init add_system_ram_resources(void)
+{
+ struct memblock_region *reg;
+
+ for_each_memblock(memory, reg) {
+ struct resource *res;
+ unsigned long base = reg->base;
+ unsigned long size = reg->size;
+
+ res = kzalloc(sizeof(struct resource), GFP_KERNEL);
+ WARN_ON(!res);
+
+ if (res) {
+ res->name = "System RAM";
+ res->start = base;
+ res->end = base + size - 1;
+ res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ WARN_ON(request_resource(&iomem_resource, res) < 0);
+ }
+ }
+
+ return 0;
+}
+subsys_initcall(add_system_ram_resources);
+
+#ifdef CONFIG_STRICT_DEVMEM
+/*
+ * devmem_is_allowed(): check to see if /dev/mem access to a certain address
+ * is valid. The argument is a physical page number.
+ *
+ * Access has to be given to non-kernel-ram areas as well, these contain the
+ * PCI mmio resources as well as potential bios/acpi data regions.
+ */
+int devmem_is_allowed(unsigned long pfn)
+{
+ if (iomem_is_exclusive(pfn << PAGE_SHIFT))
+ return 0;
+ if (!page_is_ram(pfn))
+ return 1;
+ if (page_is_rtas_user_buf(pfn))
+ return 1;
+ return 0;
}
+#endif /* CONFIG_STRICT_DEVMEM */
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
index 86010fc7d3b..cb8bdbe4972 100644
--- a/arch/powerpc/mm/mmap.c
+++ b/arch/powerpc/mm/mmap.c
@@ -24,43 +24,59 @@
#include <linux/personality.h>
#include <linux/mm.h>
+#include <linux/random.h>
#include <linux/sched.h>
/*
* Top of mmap area (just below the process stack).
*
- * Leave an at least ~128 MB hole.
+ * Leave at least a ~128 MB hole on 32bit applications.
+ *
+ * On 64bit applications we randomise the stack by 1GB so we need to
+ * space our mmap start address by a further 1GB, otherwise there is a
+ * chance the mmap area will end up closer to the stack than our ulimit
+ * requires.
*/
-#define MIN_GAP (128*1024*1024)
+#define MIN_GAP32 (128*1024*1024)
+#define MIN_GAP64 ((128 + 1024)*1024*1024UL)
+#define MIN_GAP ((is_32bit_task()) ? MIN_GAP32 : MIN_GAP64)
#define MAX_GAP (TASK_SIZE/6*5)
-static inline unsigned long mmap_base(void)
+static inline int mmap_is_legacy(void)
{
- unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
+ if (current->personality & ADDR_COMPAT_LAYOUT)
+ return 1;
- if (gap < MIN_GAP)
- gap = MIN_GAP;
- else if (gap > MAX_GAP)
- gap = MAX_GAP;
+ if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
+ return 1;
- return TASK_SIZE - (gap & PAGE_MASK);
+ return sysctl_legacy_va_layout;
}
-static inline int mmap_is_legacy(void)
+static unsigned long mmap_rnd(void)
{
- /*
- * Force standard allocation for 64 bit programs.
- */
- if (!test_thread_flag(TIF_32BIT))
- return 1;
+ unsigned long rnd = 0;
- if (current->personality & ADDR_COMPAT_LAYOUT)
- return 1;
+ if (current->flags & PF_RANDOMIZE) {
+ /* 8MB for 32bit, 1GB for 64bit */
+ if (is_32bit_task())
+ rnd = (long)(get_random_int() % (1<<(23-PAGE_SHIFT)));
+ else
+ rnd = (long)(get_random_int() % (1<<(30-PAGE_SHIFT)));
+ }
+ return rnd << PAGE_SHIFT;
+}
- if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY)
- return 1;
+static inline unsigned long mmap_base(void)
+{
+ unsigned long gap = rlimit(RLIMIT_STACK);
- return sysctl_legacy_va_layout;
+ if (gap < MIN_GAP)
+ gap = MIN_GAP;
+ else if (gap > MAX_GAP)
+ gap = MAX_GAP;
+
+ return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd());
}
/*
@@ -76,10 +92,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
if (mmap_is_legacy()) {
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = arch_get_unmapped_area;
- mm->unmap_area = arch_unmap_area;
} else {
mm->mmap_base = mmap_base();
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
- mm->unmap_area = arch_unmap_area_topdown;
}
}
diff --git a/arch/powerpc/mm/mmu_context_32.c b/arch/powerpc/mm/mmu_context_32.c
deleted file mode 100644
index cc32ba41d90..00000000000
--- a/arch/powerpc/mm/mmu_context_32.c
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * This file contains the routines for handling the MMU on those
- * PowerPC implementations where the MMU substantially follows the
- * architecture specification. This includes the 6xx, 7xx, 7xxx,
- * 8260, and POWER3 implementations but excludes the 8xx and 4xx.
- * -- paulus
- *
- * Derived from arch/ppc/mm/init.c:
- * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
- *
- * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
- * and Cort Dougan (PReP) (cort@cs.nmt.edu)
- * Copyright (C) 1996 Paul Mackerras
- *
- * Derived from "arch/i386/mm/init.c"
- * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/mm.h>
-#include <linux/init.h>
-
-#include <asm/mmu_context.h>
-#include <asm/tlbflush.h>
-
-unsigned long next_mmu_context;
-unsigned long context_map[LAST_CONTEXT / BITS_PER_LONG + 1];
-#ifdef FEW_CONTEXTS
-atomic_t nr_free_contexts;
-struct mm_struct *context_mm[LAST_CONTEXT+1];
-void steal_context(void);
-#endif /* FEW_CONTEXTS */
-
-/*
- * Initialize the context management stuff.
- */
-void __init
-mmu_context_init(void)
-{
- /*
- * Some processors have too few contexts to reserve one for
- * init_mm, and require using context 0 for a normal task.
- * Other processors reserve the use of context zero for the kernel.
- * This code assumes FIRST_CONTEXT < 32.
- */
- context_map[0] = (1 << FIRST_CONTEXT) - 1;
- next_mmu_context = FIRST_CONTEXT;
-#ifdef FEW_CONTEXTS
- atomic_set(&nr_free_contexts, LAST_CONTEXT - FIRST_CONTEXT + 1);
-#endif /* FEW_CONTEXTS */
-}
-
-#ifdef FEW_CONTEXTS
-/*
- * Steal a context from a task that has one at the moment.
- * This is only used on 8xx and 4xx and we presently assume that
- * they don't do SMP. If they do then this will have to check
- * whether the MM we steal is in use.
- * We also assume that this is only used on systems that don't
- * use an MMU hash table - this is true for 8xx and 4xx.
- * This isn't an LRU system, it just frees up each context in
- * turn (sort-of pseudo-random replacement :). This would be the
- * place to implement an LRU scheme if anyone was motivated to do it.
- * -- paulus
- */
-void
-steal_context(void)
-{
- struct mm_struct *mm;
-
- /* free up context `next_mmu_context' */
- /* if we shouldn't free context 0, don't... */
- if (next_mmu_context < FIRST_CONTEXT)
- next_mmu_context = FIRST_CONTEXT;
- mm = context_mm[next_mmu_context];
- flush_tlb_mm(mm);
- destroy_context(mm);
-}
-#endif /* FEW_CONTEXTS */
diff --git a/arch/powerpc/mm/mmu_context_64.c b/arch/powerpc/mm/mmu_context_64.c
deleted file mode 100644
index 1db38ba1f54..00000000000
--- a/arch/powerpc/mm/mmu_context_64.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * MMU context allocation for 64-bit kernels.
- *
- * Copyright (C) 2004 Anton Blanchard, IBM Corp. <anton@samba.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/spinlock.h>
-#include <linux/idr.h>
-
-#include <asm/mmu_context.h>
-
-static DEFINE_SPINLOCK(mmu_context_lock);
-static DEFINE_IDR(mmu_context_idr);
-
-int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
-{
- int index;
- int err;
-
-again:
- if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL))
- return -ENOMEM;
-
- spin_lock(&mmu_context_lock);
- err = idr_get_new_above(&mmu_context_idr, NULL, 1, &index);
- spin_unlock(&mmu_context_lock);
-
- if (err == -EAGAIN)
- goto again;
- else if (err)
- return err;
-
- if (index > MAX_CONTEXT) {
- spin_lock(&mmu_context_lock);
- idr_remove(&mmu_context_idr, index);
- spin_unlock(&mmu_context_lock);
- return -ENOMEM;
- }
-
- /* The old code would re-promote on fork, we don't do that
- * when using slices as it could cause problem promoting slices
- * that have been forced down to 4K
- */
- if (slice_mm_new_context(mm))
- slice_set_user_psize(mm, mmu_virtual_psize);
- mm->context.id = index;
-
- return 0;
-}
-
-void destroy_context(struct mm_struct *mm)
-{
- spin_lock(&mmu_context_lock);
- idr_remove(&mmu_context_idr, mm->context.id);
- spin_unlock(&mmu_context_lock);
-
- mm->context.id = NO_CONTEXT;
-}
diff --git a/arch/powerpc/mm/mmu_context_hash32.c b/arch/powerpc/mm/mmu_context_hash32.c
new file mode 100644
index 00000000000..78fef6726e1
--- /dev/null
+++ b/arch/powerpc/mm/mmu_context_hash32.c
@@ -0,0 +1,119 @@
+/*
+ * This file contains the routines for handling the MMU on those
+ * PowerPC implementations where the MMU substantially follows the
+ * architecture specification. This includes the 6xx, 7xx, 7xxx,
+ * 8260, and POWER3 implementations but excludes the 8xx and 4xx.
+ * -- paulus
+ *
+ * Derived from arch/ppc/mm/init.c:
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ * and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ * Copyright (C) 1996 Paul Mackerras
+ *
+ * Derived from "arch/i386/mm/init.c"
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/export.h>
+
+#include <asm/mmu_context.h>
+#include <asm/tlbflush.h>
+
+/*
+ * On 32-bit PowerPC 6xx/7xx/7xxx CPUs, we use a set of 16 VSIDs
+ * (virtual segment identifiers) for each context. Although the
+ * hardware supports 24-bit VSIDs, and thus >1 million contexts,
+ * we only use 32,768 of them. That is ample, since there can be
+ * at most around 30,000 tasks in the system anyway, and it means
+ * that we can use a bitmap to indicate which contexts are in use.
+ * Using a bitmap means that we entirely avoid all of the problems
+ * that we used to have when the context number overflowed,
+ * particularly on SMP systems.
+ * -- paulus.
+ */
+#define NO_CONTEXT ((unsigned long) -1)
+#define LAST_CONTEXT 32767
+#define FIRST_CONTEXT 1
+
+/*
+ * This function defines the mapping from contexts to VSIDs (virtual
+ * segment IDs). We use a skew on both the context and the high 4 bits
+ * of the 32-bit virtual address (the "effective segment ID") in order
+ * to spread out the entries in the MMU hash table. Note, if this
+ * function is changed then arch/ppc/mm/hashtable.S will have to be
+ * changed to correspond.
+ *
+ *
+ * CTX_TO_VSID(ctx, va) (((ctx) * (897 * 16) + ((va) >> 28) * 0x111) \
+ * & 0xffffff)
+ */
+
+static unsigned long next_mmu_context;
+static unsigned long context_map[LAST_CONTEXT / BITS_PER_LONG + 1];
+
+unsigned long __init_new_context(void)
+{
+ unsigned long ctx = next_mmu_context;
+
+ while (test_and_set_bit(ctx, context_map)) {
+ ctx = find_next_zero_bit(context_map, LAST_CONTEXT+1, ctx);
+ if (ctx > LAST_CONTEXT)
+ ctx = 0;
+ }
+ next_mmu_context = (ctx + 1) & LAST_CONTEXT;
+
+ return ctx;
+}
+EXPORT_SYMBOL_GPL(__init_new_context);
+
+/*
+ * Set up the context for a new address space.
+ */
+int init_new_context(struct task_struct *t, struct mm_struct *mm)
+{
+ mm->context.id = __init_new_context();
+
+ return 0;
+}
+
+/*
+ * Free a context ID. Make sure to call this with preempt disabled!
+ */
+void __destroy_context(unsigned long ctx)
+{
+ clear_bit(ctx, context_map);
+}
+EXPORT_SYMBOL_GPL(__destroy_context);
+
+/*
+ * We're finished using the context for an address space.
+ */
+void destroy_context(struct mm_struct *mm)
+{
+ preempt_disable();
+ if (mm->context.id != NO_CONTEXT) {
+ __destroy_context(mm->context.id);
+ mm->context.id = NO_CONTEXT;
+ }
+ preempt_enable();
+}
+
+/*
+ * Initialize the context management stuff.
+ */
+void __init mmu_context_init(void)
+{
+ /* Reserve context 0 for kernel use */
+ context_map[0] = (1 << FIRST_CONTEXT) - 1;
+ next_mmu_context = FIRST_CONTEXT;
+}
diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c
new file mode 100644
index 00000000000..178876aef40
--- /dev/null
+++ b/arch/powerpc/mm/mmu_context_hash64.c
@@ -0,0 +1,146 @@
+/*
+ * MMU context allocation for 64-bit kernels.
+ *
+ * Copyright (C) 2004 Anton Blanchard, IBM Corp. <anton@samba.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/idr.h>
+#include <linux/export.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+
+#include <asm/mmu_context.h>
+#include <asm/pgalloc.h>
+
+#include "icswx.h"
+
+static DEFINE_SPINLOCK(mmu_context_lock);
+static DEFINE_IDA(mmu_context_ida);
+
+int __init_new_context(void)
+{
+ int index;
+ int err;
+
+again:
+ if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL))
+ return -ENOMEM;
+
+ spin_lock(&mmu_context_lock);
+ err = ida_get_new_above(&mmu_context_ida, 1, &index);
+ spin_unlock(&mmu_context_lock);
+
+ if (err == -EAGAIN)
+ goto again;
+ else if (err)
+ return err;
+
+ if (index > MAX_USER_CONTEXT) {
+ spin_lock(&mmu_context_lock);
+ ida_remove(&mmu_context_ida, index);
+ spin_unlock(&mmu_context_lock);
+ return -ENOMEM;
+ }
+
+ return index;
+}
+EXPORT_SYMBOL_GPL(__init_new_context);
+
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+{
+ int index;
+
+ index = __init_new_context();
+ if (index < 0)
+ return index;
+
+ /* The old code would re-promote on fork, we don't do that
+ * when using slices as it could cause problem promoting slices
+ * that have been forced down to 4K
+ */
+ if (slice_mm_new_context(mm))
+ slice_set_user_psize(mm, mmu_virtual_psize);
+ subpage_prot_init_new_context(mm);
+ mm->context.id = index;
+#ifdef CONFIG_PPC_ICSWX
+ mm->context.cop_lockp = kmalloc(sizeof(spinlock_t), GFP_KERNEL);
+ if (!mm->context.cop_lockp) {
+ __destroy_context(index);
+ subpage_prot_free(mm);
+ mm->context.id = MMU_NO_CONTEXT;
+ return -ENOMEM;
+ }
+ spin_lock_init(mm->context.cop_lockp);
+#endif /* CONFIG_PPC_ICSWX */
+
+#ifdef CONFIG_PPC_64K_PAGES
+ mm->context.pte_frag = NULL;
+#endif
+ return 0;
+}
+
+void __destroy_context(int context_id)
+{
+ spin_lock(&mmu_context_lock);
+ ida_remove(&mmu_context_ida, context_id);
+ spin_unlock(&mmu_context_lock);
+}
+EXPORT_SYMBOL_GPL(__destroy_context);
+
+#ifdef CONFIG_PPC_64K_PAGES
+static void destroy_pagetable_page(struct mm_struct *mm)
+{
+ int count;
+ void *pte_frag;
+ struct page *page;
+
+ pte_frag = mm->context.pte_frag;
+ if (!pte_frag)
+ return;
+
+ page = virt_to_page(pte_frag);
+ /* drop all the pending references */
+ count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT;
+ /* We allow PTE_FRAG_NR fragments from a PTE page */
+ count = atomic_sub_return(PTE_FRAG_NR - count, &page->_count);
+ if (!count) {
+ pgtable_page_dtor(page);
+ free_hot_cold_page(page, 0);
+ }
+}
+
+#else
+static inline void destroy_pagetable_page(struct mm_struct *mm)
+{
+ return;
+}
+#endif
+
+
+void destroy_context(struct mm_struct *mm)
+{
+
+#ifdef CONFIG_PPC_ICSWX
+ drop_cop(mm->context.acop, mm);
+ kfree(mm->context.cop_lockp);
+ mm->context.cop_lockp = NULL;
+#endif /* CONFIG_PPC_ICSWX */
+
+ destroy_pagetable_page(mm);
+ __destroy_context(mm->context.id);
+ subpage_prot_free(mm);
+ mm->context.id = MMU_NO_CONTEXT;
+}
diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
new file mode 100644
index 00000000000..928ebe79668
--- /dev/null
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -0,0 +1,449 @@
+/*
+ * This file contains the routines for handling the MMU on those
+ * PowerPC implementations where the MMU is not using the hash
+ * table, such as 8xx, 4xx, BookE's etc...
+ *
+ * Copyright 2008 Ben Herrenschmidt <benh@kernel.crashing.org>
+ * IBM Corp.
+ *
+ * Derived from previous arch/powerpc/mm/mmu_context.c
+ * and arch/powerpc/include/asm/mmu_context.h
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * TODO:
+ *
+ * - The global context lock will not scale very well
+ * - The maps should be dynamically allocated to allow for processors
+ * that support more PID bits at runtime
+ * - Implement flush_tlb_mm() by making the context stale and picking
+ * a new one
+ * - More aggressively clear stale map bits and maybe find some way to
+ * also clear mm->cpu_vm_mask bits when processes are migrated
+ */
+
+//#define DEBUG_MAP_CONSISTENCY
+//#define DEBUG_CLAMP_LAST_CONTEXT 31
+//#define DEBUG_HARDER
+
+/* We don't use DEBUG because it tends to be compiled in always nowadays
+ * and this would generate way too much output
+ */
+#ifdef DEBUG_HARDER
+#define pr_hard(args...) printk(KERN_DEBUG args)
+#define pr_hardcont(args...) printk(KERN_CONT args)
+#else
+#define pr_hard(args...) do { } while(0)
+#define pr_hardcont(args...) do { } while(0)
+#endif
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/bootmem.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/slab.h>
+
+#include <asm/mmu_context.h>
+#include <asm/tlbflush.h>
+
+static unsigned int first_context, last_context;
+static unsigned int next_context, nr_free_contexts;
+static unsigned long *context_map;
+static unsigned long *stale_map[NR_CPUS];
+static struct mm_struct **context_mm;
+static DEFINE_RAW_SPINLOCK(context_lock);
+
+#define CTX_MAP_SIZE \
+ (sizeof(unsigned long) * (last_context / BITS_PER_LONG + 1))
+
+
+/* Steal a context from a task that has one at the moment.
+ *
+ * This is used when we are running out of available PID numbers
+ * on the processors.
+ *
+ * This isn't an LRU system, it just frees up each context in
+ * turn (sort-of pseudo-random replacement :). This would be the
+ * place to implement an LRU scheme if anyone was motivated to do it.
+ * -- paulus
+ *
+ * For context stealing, we use a slightly different approach for
+ * SMP and UP. Basically, the UP one is simpler and doesn't use
+ * the stale map as we can just flush the local CPU
+ * -- benh
+ */
+#ifdef CONFIG_SMP
+static unsigned int steal_context_smp(unsigned int id)
+{
+ struct mm_struct *mm;
+ unsigned int cpu, max, i;
+
+ max = last_context - first_context;
+
+ /* Attempt to free next_context first and then loop until we manage */
+ while (max--) {
+ /* Pick up the victim mm */
+ mm = context_mm[id];
+
+ /* We have a candidate victim, check if it's active, on SMP
+ * we cannot steal active contexts
+ */
+ if (mm->context.active) {
+ id++;
+ if (id > last_context)
+ id = first_context;
+ continue;
+ }
+ pr_hardcont(" | steal %d from 0x%p", id, mm);
+
+ /* Mark this mm has having no context anymore */
+ mm->context.id = MMU_NO_CONTEXT;
+
+ /* Mark it stale on all CPUs that used this mm. For threaded
+ * implementations, we set it on all threads on each core
+ * represented in the mask. A future implementation will use
+ * a core map instead but this will do for now.
+ */
+ for_each_cpu(cpu, mm_cpumask(mm)) {
+ for (i = cpu_first_thread_sibling(cpu);
+ i <= cpu_last_thread_sibling(cpu); i++) {
+ if (stale_map[i])
+ __set_bit(id, stale_map[i]);
+ }
+ cpu = i - 1;
+ }
+ return id;
+ }
+
+ /* This will happen if you have more CPUs than available contexts,
+ * all we can do here is wait a bit and try again
+ */
+ raw_spin_unlock(&context_lock);
+ cpu_relax();
+ raw_spin_lock(&context_lock);
+
+ /* This will cause the caller to try again */
+ return MMU_NO_CONTEXT;
+}
+#endif /* CONFIG_SMP */
+
+/* Note that this will also be called on SMP if all other CPUs are
+ * offlined, which means that it may be called for cpu != 0. For
+ * this to work, we somewhat assume that CPUs that are onlined
+ * come up with a fully clean TLB (or are cleaned when offlined)
+ */
+static unsigned int steal_context_up(unsigned int id)
+{
+ struct mm_struct *mm;
+ int cpu = smp_processor_id();
+
+ /* Pick up the victim mm */
+ mm = context_mm[id];
+
+ pr_hardcont(" | steal %d from 0x%p", id, mm);
+
+ /* Flush the TLB for that context */
+ local_flush_tlb_mm(mm);
+
+ /* Mark this mm has having no context anymore */
+ mm->context.id = MMU_NO_CONTEXT;
+
+ /* XXX This clear should ultimately be part of local_flush_tlb_mm */
+ __clear_bit(id, stale_map[cpu]);
+
+ return id;
+}
+
+#ifdef DEBUG_MAP_CONSISTENCY
+static void context_check_map(void)
+{
+ unsigned int id, nrf, nact;
+
+ nrf = nact = 0;
+ for (id = first_context; id <= last_context; id++) {
+ int used = test_bit(id, context_map);
+ if (!used)
+ nrf++;
+ if (used != (context_mm[id] != NULL))
+ pr_err("MMU: Context %d is %s and MM is %p !\n",
+ id, used ? "used" : "free", context_mm[id]);
+ if (context_mm[id] != NULL)
+ nact += context_mm[id]->context.active;
+ }
+ if (nrf != nr_free_contexts) {
+ pr_err("MMU: Free context count out of sync ! (%d vs %d)\n",
+ nr_free_contexts, nrf);
+ nr_free_contexts = nrf;
+ }
+ if (nact > num_online_cpus())
+ pr_err("MMU: More active contexts than CPUs ! (%d vs %d)\n",
+ nact, num_online_cpus());
+ if (first_context > 0 && !test_bit(0, context_map))
+ pr_err("MMU: Context 0 has been freed !!!\n");
+}
+#else
+static void context_check_map(void) { }
+#endif
+
+void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
+{
+ unsigned int i, id, cpu = smp_processor_id();
+ unsigned long *map;
+
+ /* No lockless fast path .. yet */
+ raw_spin_lock(&context_lock);
+
+ pr_hard("[%d] activating context for mm @%p, active=%d, id=%d",
+ cpu, next, next->context.active, next->context.id);
+
+#ifdef CONFIG_SMP
+ /* Mark us active and the previous one not anymore */
+ next->context.active++;
+ if (prev) {
+ pr_hardcont(" (old=0x%p a=%d)", prev, prev->context.active);
+ WARN_ON(prev->context.active < 1);
+ prev->context.active--;
+ }
+
+ again:
+#endif /* CONFIG_SMP */
+
+ /* If we already have a valid assigned context, skip all that */
+ id = next->context.id;
+ if (likely(id != MMU_NO_CONTEXT)) {
+#ifdef DEBUG_MAP_CONSISTENCY
+ if (context_mm[id] != next)
+ pr_err("MMU: mm 0x%p has id %d but context_mm[%d] says 0x%p\n",
+ next, id, id, context_mm[id]);
+#endif
+ goto ctxt_ok;
+ }
+
+ /* We really don't have a context, let's try to acquire one */
+ id = next_context;
+ if (id > last_context)
+ id = first_context;
+ map = context_map;
+
+ /* No more free contexts, let's try to steal one */
+ if (nr_free_contexts == 0) {
+#ifdef CONFIG_SMP
+ if (num_online_cpus() > 1) {
+ id = steal_context_smp(id);
+ if (id == MMU_NO_CONTEXT)
+ goto again;
+ goto stolen;
+ }
+#endif /* CONFIG_SMP */
+ id = steal_context_up(id);
+ goto stolen;
+ }
+ nr_free_contexts--;
+
+ /* We know there's at least one free context, try to find it */
+ while (__test_and_set_bit(id, map)) {
+ id = find_next_zero_bit(map, last_context+1, id);
+ if (id > last_context)
+ id = first_context;
+ }
+ stolen:
+ next_context = id + 1;
+ context_mm[id] = next;
+ next->context.id = id;
+ pr_hardcont(" | new id=%d,nrf=%d", id, nr_free_contexts);
+
+ context_check_map();
+ ctxt_ok:
+
+ /* If that context got marked stale on this CPU, then flush the
+ * local TLB for it and unmark it before we use it
+ */
+ if (test_bit(id, stale_map[cpu])) {
+ pr_hardcont(" | stale flush %d [%d..%d]",
+ id, cpu_first_thread_sibling(cpu),
+ cpu_last_thread_sibling(cpu));
+
+ local_flush_tlb_mm(next);
+
+ /* XXX This clear should ultimately be part of local_flush_tlb_mm */
+ for (i = cpu_first_thread_sibling(cpu);
+ i <= cpu_last_thread_sibling(cpu); i++) {
+ if (stale_map[i])
+ __clear_bit(id, stale_map[i]);
+ }
+ }
+
+ /* Flick the MMU and release lock */
+ pr_hardcont(" -> %d\n", id);
+ set_context(id, next->pgd);
+ raw_spin_unlock(&context_lock);
+}
+
+/*
+ * Set up the context for a new address space.
+ */
+int init_new_context(struct task_struct *t, struct mm_struct *mm)
+{
+ pr_hard("initing context for mm @%p\n", mm);
+
+ mm->context.id = MMU_NO_CONTEXT;
+ mm->context.active = 0;
+
+#ifdef CONFIG_PPC_MM_SLICES
+ if (slice_mm_new_context(mm))
+ slice_set_user_psize(mm, mmu_virtual_psize);
+#endif
+
+ return 0;
+}
+
+/*
+ * We're finished using the context for an address space.
+ */
+void destroy_context(struct mm_struct *mm)
+{
+ unsigned long flags;
+ unsigned int id;
+
+ if (mm->context.id == MMU_NO_CONTEXT)
+ return;
+
+ WARN_ON(mm->context.active != 0);
+
+ raw_spin_lock_irqsave(&context_lock, flags);
+ id = mm->context.id;
+ if (id != MMU_NO_CONTEXT) {
+ __clear_bit(id, context_map);
+ mm->context.id = MMU_NO_CONTEXT;
+#ifdef DEBUG_MAP_CONSISTENCY
+ mm->context.active = 0;
+#endif
+ context_mm[id] = NULL;
+ nr_free_contexts++;
+ }
+ raw_spin_unlock_irqrestore(&context_lock, flags);
+}
+
+#ifdef CONFIG_SMP
+
+static int mmu_context_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (unsigned int)(long)hcpu;
+
+ /* We don't touch CPU 0 map, it's allocated at aboot and kept
+ * around forever
+ */
+ if (cpu == boot_cpuid)
+ return NOTIFY_OK;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ pr_devel("MMU: Allocating stale context map for CPU %d\n", cpu);
+ stale_map[cpu] = kzalloc(CTX_MAP_SIZE, GFP_KERNEL);
+ break;
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ pr_devel("MMU: Freeing stale context map for CPU %d\n", cpu);
+ kfree(stale_map[cpu]);
+ stale_map[cpu] = NULL;
+
+ /* We also clear the cpu_vm_mask bits of CPUs going away */
+ clear_tasks_mm_cpumask(cpu);
+ break;
+#endif /* CONFIG_HOTPLUG_CPU */
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block mmu_context_cpu_nb = {
+ .notifier_call = mmu_context_cpu_notify,
+};
+
+#endif /* CONFIG_SMP */
+
+/*
+ * Initialize the context management stuff.
+ */
+void __init mmu_context_init(void)
+{
+ /* Mark init_mm as being active on all possible CPUs since
+ * we'll get called with prev == init_mm the first time
+ * we schedule on a given CPU
+ */
+ init_mm.context.active = NR_CPUS;
+
+ /*
+ * The MPC8xx has only 16 contexts. We rotate through them on each
+ * task switch. A better way would be to keep track of tasks that
+ * own contexts, and implement an LRU usage. That way very active
+ * tasks don't always have to pay the TLB reload overhead. The
+ * kernel pages are mapped shared, so the kernel can run on behalf
+ * of any task that makes a kernel entry. Shared does not mean they
+ * are not protected, just that the ASID comparison is not performed.
+ * -- Dan
+ *
+ * The IBM4xx has 256 contexts, so we can just rotate through these
+ * as a way of "switching" contexts. If the TID of the TLB is zero,
+ * the PID/TID comparison is disabled, so we can use a TID of zero
+ * to represent all kernel pages as shared among all contexts.
+ * -- Dan
+ *
+ * The IBM 47x core supports 16-bit PIDs, thus 65535 contexts. We
+ * should normally never have to steal though the facility is
+ * present if needed.
+ * -- BenH
+ */
+ if (mmu_has_feature(MMU_FTR_TYPE_8xx)) {
+ first_context = 0;
+ last_context = 15;
+ } else if (mmu_has_feature(MMU_FTR_TYPE_47x)) {
+ first_context = 1;
+ last_context = 65535;
+ } else {
+ first_context = 1;
+ last_context = 255;
+ }
+
+#ifdef DEBUG_CLAMP_LAST_CONTEXT
+ last_context = DEBUG_CLAMP_LAST_CONTEXT;
+#endif
+ /*
+ * Allocate the maps used by context management
+ */
+ context_map = alloc_bootmem(CTX_MAP_SIZE);
+ context_mm = alloc_bootmem(sizeof(void *) * (last_context + 1));
+#ifndef CONFIG_SMP
+ stale_map[0] = alloc_bootmem(CTX_MAP_SIZE);
+#else
+ stale_map[boot_cpuid] = alloc_bootmem(CTX_MAP_SIZE);
+
+ register_cpu_notifier(&mmu_context_cpu_nb);
+#endif
+
+ printk(KERN_INFO
+ "MMU: Allocated %zu bytes of context maps for %d contexts\n",
+ 2 * CTX_MAP_SIZE + (sizeof(void *) * (last_context + 1)),
+ last_context - first_context + 1);
+
+ /*
+ * Some processors have too few contexts to reserve one for
+ * init_mm, and require using context 0 for a normal task.
+ * Other processors reserve the use of context zero for the kernel.
+ * This code assumes first_context < 32.
+ */
+ context_map[0] = (1 << first_context) - 1;
+ next_context = first_context;
+ nr_free_contexts = last_context - first_context + 1;
+}
+
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index ebfd13dc9d1..9615d82919b 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -22,20 +22,84 @@
#include <asm/tlbflush.h>
#include <asm/mmu.h>
+#ifdef CONFIG_PPC_MMU_NOHASH
+
+/*
+ * On 40x and 8xx, we directly inline tlbia and tlbivax
+ */
+#if defined(CONFIG_40x) || defined(CONFIG_8xx)
+static inline void _tlbil_all(void)
+{
+ asm volatile ("sync; tlbia; isync" : : : "memory");
+}
+static inline void _tlbil_pid(unsigned int pid)
+{
+ asm volatile ("sync; tlbia; isync" : : : "memory");
+}
+#define _tlbil_pid_noind(pid) _tlbil_pid(pid)
+
+#else /* CONFIG_40x || CONFIG_8xx */
+extern void _tlbil_all(void);
+extern void _tlbil_pid(unsigned int pid);
+#ifdef CONFIG_PPC_BOOK3E
+extern void _tlbil_pid_noind(unsigned int pid);
+#else
+#define _tlbil_pid_noind(pid) _tlbil_pid(pid)
+#endif
+#endif /* !(CONFIG_40x || CONFIG_8xx) */
+
+/*
+ * On 8xx, we directly inline tlbie, on others, it's extern
+ */
+#ifdef CONFIG_8xx
+static inline void _tlbil_va(unsigned long address, unsigned int pid,
+ unsigned int tsize, unsigned int ind)
+{
+ asm volatile ("tlbie %0; sync" : : "r" (address) : "memory");
+}
+#elif defined(CONFIG_PPC_BOOK3E)
+extern void _tlbil_va(unsigned long address, unsigned int pid,
+ unsigned int tsize, unsigned int ind);
+#else
+extern void __tlbil_va(unsigned long address, unsigned int pid);
+static inline void _tlbil_va(unsigned long address, unsigned int pid,
+ unsigned int tsize, unsigned int ind)
+{
+ __tlbil_va(address, pid);
+}
+#endif /* CONIFG_8xx */
+
+#if defined(CONFIG_PPC_BOOK3E) || defined(CONFIG_PPC_47x)
+extern void _tlbivax_bcast(unsigned long address, unsigned int pid,
+ unsigned int tsize, unsigned int ind);
+#else
+static inline void _tlbivax_bcast(unsigned long address, unsigned int pid,
+ unsigned int tsize, unsigned int ind)
+{
+ BUG();
+}
+#endif
+
+#else /* CONFIG_PPC_MMU_NOHASH */
+
extern void hash_preload(struct mm_struct *mm, unsigned long ea,
unsigned long access, unsigned long trap);
+extern void _tlbie(unsigned long address);
+extern void _tlbia(void);
+
+#endif /* CONFIG_PPC_MMU_NOHASH */
+
#ifdef CONFIG_PPC32
+
extern void mapin_ram(void);
extern int map_page(unsigned long va, phys_addr_t pa, int flags);
-extern void setbat(int index, unsigned long virt, unsigned long phys,
+extern void setbat(int index, unsigned long virt, phys_addr_t phys,
unsigned int size, int flags);
-extern void settlbcam(int index, unsigned long virt, phys_addr_t phys,
- unsigned int size, int flags, unsigned int pid);
-extern void invalidate_tlbcam_entry(int index);
extern int __map_without_bats;
+extern int __allow_ioremap_reserved;
extern unsigned long ioremap_base;
extern unsigned int rtas_data, rtas_size;
@@ -43,50 +107,61 @@ struct hash_pte;
extern struct hash_pte *Hash, *Hash_end;
extern unsigned long Hash_size, Hash_mask;
-extern unsigned int num_tlbcam_entries;
-#endif
+#endif /* CONFIG_PPC32 */
+
+#ifdef CONFIG_PPC64
+extern int map_kernel_page(unsigned long ea, unsigned long pa, int flags);
+#endif /* CONFIG_PPC64 */
extern unsigned long ioremap_bot;
extern unsigned long __max_low_memory;
-extern unsigned long __initial_memory_limit;
-extern unsigned long total_memory;
-extern unsigned long total_lowmem;
+extern phys_addr_t __initial_memory_limit_addr;
+extern phys_addr_t total_memory;
+extern phys_addr_t total_lowmem;
+extern phys_addr_t memstart_addr;
+extern phys_addr_t lowmem_end_addr;
+
+#ifdef CONFIG_WII
+extern unsigned long wii_hole_start;
+extern unsigned long wii_hole_size;
+
+extern unsigned long wii_mmu_mapin_mem2(unsigned long top);
+extern void wii_memory_fixups(void);
+#endif
/* ...and now those things that may be slightly different between processor
* architectures. -- Dan
*/
#if defined(CONFIG_8xx)
-#define flush_HPTE(X, va, pg) _tlbie(va, 0 /* 8xx doesn't care about PID */)
#define MMU_init_hw() do { } while(0)
-#define mmu_mapin_ram() (0UL)
+#define mmu_mapin_ram(top) (0UL)
#elif defined(CONFIG_4xx)
-#define flush_HPTE(pid, va, pg) _tlbie(va, pid)
extern void MMU_init_hw(void);
-extern unsigned long mmu_mapin_ram(void);
+extern unsigned long mmu_mapin_ram(unsigned long top);
-#elif defined(CONFIG_FSL_BOOKE)
-#define flush_HPTE(pid, va, pg) _tlbie(va, pid)
+#elif defined(CONFIG_PPC_FSL_BOOK3E)
+extern unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx);
+extern unsigned long calc_cam_sz(unsigned long ram, unsigned long virt,
+ phys_addr_t phys);
+#ifdef CONFIG_PPC32
extern void MMU_init_hw(void);
-extern unsigned long mmu_mapin_ram(void);
+extern unsigned long mmu_mapin_ram(unsigned long top);
extern void adjust_total_lowmem(void);
+extern int switch_to_as1(void);
+extern void restore_to_as0(int esel, int offset, void *dt_ptr, int bootcpu);
+#endif
+extern void loadcam_entry(unsigned int index);
+struct tlbcam {
+ u32 MAS0;
+ u32 MAS1;
+ unsigned long MAS2;
+ u32 MAS3;
+ u32 MAS7;
+};
#elif defined(CONFIG_PPC32)
/* anything 32-bit except 4xx or 8xx */
extern void MMU_init_hw(void);
-extern unsigned long mmu_mapin_ram(void);
-
-/* Be careful....this needs to be updated if we ever encounter 603 SMPs,
- * which includes all new 82xx processors. We need tlbie/tlbsync here
- * in that case (I think). -- Dan.
- */
-static inline void flush_HPTE(unsigned context, unsigned long va,
- unsigned long pdval)
-{
- if ((Hash != 0) &&
- cpu_has_feature(CPU_FTR_HPTE_TABLE))
- flush_hash_pages(0, va, pdval, 1);
- else
- _tlbie(va);
-}
+extern unsigned long mmu_mapin_ram(unsigned long top);
#endif
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index a300d254aac..3b181b22cd4 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -13,14 +13,31 @@
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/nodemask.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
+#include <linux/memblock.h>
+#include <linux/of.h>
+#include <linux/pfn.h>
+#include <linux/cpuset.h>
+#include <linux/node.h>
+#include <linux/stop_machine.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <asm/cputhreads.h>
#include <asm/sparsemem.h>
-#include <asm/lmb.h>
-#include <asm/system.h>
+#include <asm/prom.h>
#include <asm/smp.h>
+#include <asm/cputhreads.h>
+#include <asm/topology.h>
+#include <asm/firmware.h>
+#include <asm/paca.h>
+#include <asm/hvcall.h>
+#include <asm/setup.h>
+#include <asm/vdso.h>
static int numa_enabled = 1;
@@ -30,18 +47,45 @@ static int numa_debug;
#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }
int numa_cpu_lookup_table[NR_CPUS];
-cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
+cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
struct pglist_data *node_data[MAX_NUMNODES];
EXPORT_SYMBOL(numa_cpu_lookup_table);
-EXPORT_SYMBOL(numa_cpumask_lookup_table);
+EXPORT_SYMBOL(node_to_cpumask_map);
EXPORT_SYMBOL(node_data);
-static bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES];
static int min_common_depth;
static int n_mem_addr_cells, n_mem_size_cells;
+static int form1_affinity;
-static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn,
+#define MAX_DISTANCE_REF_POINTS 4
+static int distance_ref_points_depth;
+static const __be32 *distance_ref_points;
+static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
+
+/*
+ * Allocate node_to_cpumask_map based on number of available nodes
+ * Requires node_possible_map to be valid.
+ *
+ * Note: cpumask_of_node() is not valid until after this is done.
+ */
+static void __init setup_node_to_cpumask_map(void)
+{
+ unsigned int node;
+
+ /* setup nr_node_ids if not done yet */
+ if (nr_node_ids == MAX_NUMNODES)
+ setup_nr_node_ids();
+
+ /* allocate the map */
+ for (node = 0; node < nr_node_ids; node++)
+ alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
+
+ /* cpumask_of_node() will now work */
+ dbg("Node to cpumask map for %d nodes\n", nr_node_ids);
+}
+
+static int __init fake_numa_create_new_node(unsigned long end_pfn,
unsigned int *nid)
{
unsigned long long mem;
@@ -88,92 +132,163 @@ static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn,
return 0;
}
-static void __cpuinit map_cpu_to_node(int cpu, int node)
+/*
+ * get_node_active_region - Return active region containing pfn
+ * Active range returned is empty if none found.
+ * @pfn: The page to return the region for
+ * @node_ar: Returned set to the active region containing @pfn
+ */
+static void __init get_node_active_region(unsigned long pfn,
+ struct node_active_region *node_ar)
+{
+ unsigned long start_pfn, end_pfn;
+ int i, nid;
+
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+ if (pfn >= start_pfn && pfn < end_pfn) {
+ node_ar->nid = nid;
+ node_ar->start_pfn = start_pfn;
+ node_ar->end_pfn = end_pfn;
+ break;
+ }
+ }
+}
+
+static void reset_numa_cpu_lookup_table(void)
+{
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu)
+ numa_cpu_lookup_table[cpu] = -1;
+}
+
+static void update_numa_cpu_lookup_table(unsigned int cpu, int node)
{
numa_cpu_lookup_table[cpu] = node;
+}
+
+static void map_cpu_to_node(int cpu, int node)
+{
+ update_numa_cpu_lookup_table(cpu, node);
dbg("adding cpu %d to node %d\n", cpu, node);
- if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node])))
- cpu_set(cpu, numa_cpumask_lookup_table[node]);
+ if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node])))
+ cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
}
-#ifdef CONFIG_HOTPLUG_CPU
+#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR)
static void unmap_cpu_from_node(unsigned long cpu)
{
int node = numa_cpu_lookup_table[cpu];
dbg("removing cpu %lu from node %d\n", cpu, node);
- if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
- cpu_clear(cpu, numa_cpumask_lookup_table[node]);
+ if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) {
+ cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
} else {
printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
cpu, node);
}
}
-#endif /* CONFIG_HOTPLUG_CPU */
+#endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
+
+/* must hold reference to node during call */
+static const __be32 *of_get_associativity(struct device_node *dev)
+{
+ return of_get_property(dev, "ibm,associativity", NULL);
+}
-static struct device_node * __cpuinit find_cpu_node(unsigned int cpu)
+/*
+ * Returns the property linux,drconf-usable-memory if
+ * it exists (the property exists only in kexec/kdump kernels,
+ * added by kexec-tools)
+ */
+static const __be32 *of_get_usable_memory(struct device_node *memory)
{
- unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
- struct device_node *cpu_node = NULL;
- const unsigned int *interrupt_server, *reg;
- int len;
+ const __be32 *prop;
+ u32 len;
+ prop = of_get_property(memory, "linux,drconf-usable-memory", &len);
+ if (!prop || len < sizeof(unsigned int))
+ return NULL;
+ return prop;
+}
- while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) {
- /* Try interrupt server first */
- interrupt_server = of_get_property(cpu_node,
- "ibm,ppc-interrupt-server#s", &len);
+int __node_distance(int a, int b)
+{
+ int i;
+ int distance = LOCAL_DISTANCE;
- len = len / sizeof(u32);
+ if (!form1_affinity)
+ return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
- if (interrupt_server && (len > 0)) {
- while (len--) {
- if (interrupt_server[len] == hw_cpuid)
- return cpu_node;
- }
- } else {
- reg = of_get_property(cpu_node, "reg", &len);
- if (reg && (len > 0) && (reg[0] == hw_cpuid))
- return cpu_node;
- }
+ for (i = 0; i < distance_ref_points_depth; i++) {
+ if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
+ break;
+
+ /* Double the distance for each NUMA level */
+ distance *= 2;
}
- return NULL;
+ return distance;
}
+EXPORT_SYMBOL(__node_distance);
-/* must hold reference to node during call */
-static const int *of_get_associativity(struct device_node *dev)
+static void initialize_distance_lookup_table(int nid,
+ const __be32 *associativity)
{
- return of_get_property(dev, "ibm,associativity", NULL);
+ int i;
+
+ if (!form1_affinity)
+ return;
+
+ for (i = 0; i < distance_ref_points_depth; i++) {
+ const __be32 *entry;
+
+ entry = &associativity[be32_to_cpu(distance_ref_points[i])];
+ distance_lookup_table[nid][i] = of_read_number(entry, 1);
+ }
}
/* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
* info is found.
*/
-static int of_node_to_nid_single(struct device_node *device)
+static int associativity_to_nid(const __be32 *associativity)
{
int nid = -1;
- const unsigned int *tmp;
if (min_common_depth == -1)
goto out;
- tmp = of_get_associativity(device);
- if (!tmp)
- goto out;
-
- if (tmp[0] >= min_common_depth)
- nid = tmp[min_common_depth];
+ if (of_read_number(associativity, 1) >= min_common_depth)
+ nid = of_read_number(&associativity[min_common_depth], 1);
/* POWER4 LPAR uses 0xffff as invalid node */
if (nid == 0xffff || nid >= MAX_NUMNODES)
nid = -1;
+
+ if (nid > 0 &&
+ of_read_number(associativity, 1) >= distance_ref_points_depth)
+ initialize_distance_lookup_table(nid, associativity);
+
out:
return nid;
}
+/* Returns the nid associated with the given device tree node,
+ * or -1 if not found.
+ */
+static int of_node_to_nid_single(struct device_node *device)
+{
+ int nid = -1;
+ const __be32 *tmp;
+
+ tmp = of_get_associativity(device);
+ if (tmp)
+ nid = associativity_to_nid(tmp);
+ return nid;
+}
+
/* Walk the device tree upwards, looking for an associativity id */
int of_node_to_nid(struct device_node *device)
{
@@ -196,50 +311,75 @@ int of_node_to_nid(struct device_node *device)
}
EXPORT_SYMBOL_GPL(of_node_to_nid);
-/*
- * In theory, the "ibm,associativity" property may contain multiple
- * associativity lists because a resource may be multiply connected
- * into the machine. This resource then has different associativity
- * characteristics relative to its multiple connections. We ignore
- * this for now. We also assume that all cpu and memory sets have
- * their distances represented at a common level. This won't be
- * true for hierarchical NUMA.
- *
- * In any case the ibm,associativity-reference-points should give
- * the correct depth for a normal NUMA system.
- *
- * - Dave Hansen <haveblue@us.ibm.com>
- */
static int __init find_min_common_depth(void)
{
int depth;
- const unsigned int *ref_points;
- struct device_node *rtas_root;
- unsigned int len;
-
- rtas_root = of_find_node_by_path("/rtas");
+ struct device_node *root;
- if (!rtas_root)
- return -1;
+ if (firmware_has_feature(FW_FEATURE_OPAL))
+ root = of_find_node_by_path("/ibm,opal");
+ else
+ root = of_find_node_by_path("/rtas");
+ if (!root)
+ root = of_find_node_by_path("/");
/*
- * this property is 2 32-bit integers, each representing a level of
- * depth in the associativity nodes. The first is for an SMP
- * configuration (should be all 0's) and the second is for a normal
- * NUMA configuration.
+ * This property is a set of 32-bit integers, each representing
+ * an index into the ibm,associativity nodes.
+ *
+ * With form 0 affinity the first integer is for an SMP configuration
+ * (should be all 0's) and the second is for a normal NUMA
+ * configuration. We have only one level of NUMA.
+ *
+ * With form 1 affinity the first integer is the most significant
+ * NUMA boundary and the following are progressively less significant
+ * boundaries. There can be more than one level of NUMA.
*/
- ref_points = of_get_property(rtas_root,
- "ibm,associativity-reference-points", &len);
+ distance_ref_points = of_get_property(root,
+ "ibm,associativity-reference-points",
+ &distance_ref_points_depth);
- if ((len >= 1) && ref_points) {
- depth = ref_points[1];
- } else {
+ if (!distance_ref_points) {
dbg("NUMA: ibm,associativity-reference-points not found.\n");
- depth = -1;
+ goto err;
+ }
+
+ distance_ref_points_depth /= sizeof(int);
+
+ if (firmware_has_feature(FW_FEATURE_OPAL) ||
+ firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
+ dbg("Using form 1 affinity\n");
+ form1_affinity = 1;
+ }
+
+ if (form1_affinity) {
+ depth = of_read_number(distance_ref_points, 1);
+ } else {
+ if (distance_ref_points_depth < 2) {
+ printk(KERN_WARNING "NUMA: "
+ "short ibm,associativity-reference-points\n");
+ goto err;
+ }
+
+ depth = of_read_number(&distance_ref_points[1], 1);
}
- of_node_put(rtas_root);
+ /*
+ * Warn and cap if the hardware supports more than
+ * MAX_DISTANCE_REF_POINTS domains.
+ */
+ if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
+ printk(KERN_WARNING "NUMA: distance array capped at "
+ "%d entries\n", MAX_DISTANCE_REF_POINTS);
+ distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
+ }
+
+ of_node_put(root);
return depth;
+
+err:
+ of_node_put(root);
+ return -1;
}
static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
@@ -255,35 +395,174 @@ static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
of_node_put(memory);
}
-static unsigned long __devinit read_n_cells(int n, const unsigned int **buf)
+static unsigned long read_n_cells(int n, const __be32 **buf)
{
unsigned long result = 0;
while (n--) {
- result = (result << 32) | **buf;
+ result = (result << 32) | of_read_number(*buf, 1);
(*buf)++;
}
return result;
}
/*
+ * Read the next memblock list entry from the ibm,dynamic-memory property
+ * and return the information in the provided of_drconf_cell structure.
+ */
+static void read_drconf_cell(struct of_drconf_cell *drmem, const __be32 **cellp)
+{
+ const __be32 *cp;
+
+ drmem->base_addr = read_n_cells(n_mem_addr_cells, cellp);
+
+ cp = *cellp;
+ drmem->drc_index = of_read_number(cp, 1);
+ drmem->reserved = of_read_number(&cp[1], 1);
+ drmem->aa_index = of_read_number(&cp[2], 1);
+ drmem->flags = of_read_number(&cp[3], 1);
+
+ *cellp = cp + 4;
+}
+
+/*
+ * Retrieve and validate the ibm,dynamic-memory property of the device tree.
+ *
+ * The layout of the ibm,dynamic-memory property is a number N of memblock
+ * list entries followed by N memblock list entries. Each memblock list entry
+ * contains information as laid out in the of_drconf_cell struct above.
+ */
+static int of_get_drconf_memory(struct device_node *memory, const __be32 **dm)
+{
+ const __be32 *prop;
+ u32 len, entries;
+
+ prop = of_get_property(memory, "ibm,dynamic-memory", &len);
+ if (!prop || len < sizeof(unsigned int))
+ return 0;
+
+ entries = of_read_number(prop++, 1);
+
+ /* Now that we know the number of entries, revalidate the size
+ * of the property read in to ensure we have everything
+ */
+ if (len < (entries * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int))
+ return 0;
+
+ *dm = prop;
+ return entries;
+}
+
+/*
+ * Retrieve and validate the ibm,lmb-size property for drconf memory
+ * from the device tree.
+ */
+static u64 of_get_lmb_size(struct device_node *memory)
+{
+ const __be32 *prop;
+ u32 len;
+
+ prop = of_get_property(memory, "ibm,lmb-size", &len);
+ if (!prop || len < sizeof(unsigned int))
+ return 0;
+
+ return read_n_cells(n_mem_size_cells, &prop);
+}
+
+struct assoc_arrays {
+ u32 n_arrays;
+ u32 array_sz;
+ const __be32 *arrays;
+};
+
+/*
+ * Retrieve and validate the list of associativity arrays for drconf
+ * memory from the ibm,associativity-lookup-arrays property of the
+ * device tree..
+ *
+ * The layout of the ibm,associativity-lookup-arrays property is a number N
+ * indicating the number of associativity arrays, followed by a number M
+ * indicating the size of each associativity array, followed by a list
+ * of N associativity arrays.
+ */
+static int of_get_assoc_arrays(struct device_node *memory,
+ struct assoc_arrays *aa)
+{
+ const __be32 *prop;
+ u32 len;
+
+ prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
+ if (!prop || len < 2 * sizeof(unsigned int))
+ return -1;
+
+ aa->n_arrays = of_read_number(prop++, 1);
+ aa->array_sz = of_read_number(prop++, 1);
+
+ /* Now that we know the number of arrays and size of each array,
+ * revalidate the size of the property read in.
+ */
+ if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int))
+ return -1;
+
+ aa->arrays = prop;
+ return 0;
+}
+
+/*
+ * This is like of_node_to_nid_single() for memory represented in the
+ * ibm,dynamic-reconfiguration-memory node.
+ */
+static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
+ struct assoc_arrays *aa)
+{
+ int default_nid = 0;
+ int nid = default_nid;
+ int index;
+
+ if (min_common_depth > 0 && min_common_depth <= aa->array_sz &&
+ !(drmem->flags & DRCONF_MEM_AI_INVALID) &&
+ drmem->aa_index < aa->n_arrays) {
+ index = drmem->aa_index * aa->array_sz + min_common_depth - 1;
+ nid = of_read_number(&aa->arrays[index], 1);
+
+ if (nid == 0xffff || nid >= MAX_NUMNODES)
+ nid = default_nid;
+ }
+
+ return nid;
+}
+
+/*
* Figure out to which domain a cpu belongs and stick it there.
* Return the id of the domain used.
*/
-static int __cpuinit numa_setup_cpu(unsigned long lcpu)
+static int numa_setup_cpu(unsigned long lcpu)
{
- int nid = 0;
- struct device_node *cpu = find_cpu_node(lcpu);
+ int nid;
+ struct device_node *cpu;
+
+ /*
+ * If a valid cpu-to-node mapping is already available, use it
+ * directly instead of querying the firmware, since it represents
+ * the most recent mapping notified to us by the platform (eg: VPHN).
+ */
+ if ((nid = numa_cpu_lookup_table[lcpu]) >= 0) {
+ map_cpu_to_node(lcpu, nid);
+ return nid;
+ }
+
+ cpu = of_get_cpu_node(lcpu, NULL);
if (!cpu) {
WARN_ON(1);
+ nid = 0;
goto out;
}
nid = of_node_to_nid_single(cpu);
if (nid < 0 || !node_online(nid))
- nid = any_online_node(NODE_MASK_ALL);
+ nid = first_online_node;
out:
map_cpu_to_node(lcpu, nid);
@@ -292,17 +571,38 @@ out:
return nid;
}
-static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
- unsigned long action,
+static void verify_cpu_node_mapping(int cpu, int node)
+{
+ int base, sibling, i;
+
+ /* Verify that all the threads in the core belong to the same node */
+ base = cpu_first_thread_sibling(cpu);
+
+ for (i = 0; i < threads_per_core; i++) {
+ sibling = base + i;
+
+ if (sibling == cpu || cpu_is_offline(sibling))
+ continue;
+
+ if (cpu_to_node(sibling) != node) {
+ WARN(1, "CPU thread siblings %d and %d don't belong"
+ " to the same node!\n", cpu, sibling);
+ break;
+ }
+ }
+}
+
+static int cpu_numa_callback(struct notifier_block *nfb, unsigned long action,
void *hcpu)
{
unsigned long lcpu = (unsigned long)hcpu;
- int ret = NOTIFY_DONE;
+ int ret = NOTIFY_DONE, nid;
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
- numa_setup_cpu(lcpu);
+ nid = numa_setup_cpu(lcpu);
+ verify_cpu_node_mapping((int)lcpu, nid);
ret = NOTIFY_OK;
break;
#ifdef CONFIG_HOTPLUG_CPU
@@ -324,27 +624,40 @@ static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
* Returns the size the region should have to enforce the memory limit.
* This will either be the original value of size, a truncated value,
* or zero. If the returned value of size is 0 the region should be
- * discarded as it lies wholy above the memory limit.
+ * discarded as it lies wholly above the memory limit.
*/
static unsigned long __init numa_enforce_memory_limit(unsigned long start,
unsigned long size)
{
/*
- * We use lmb_end_of_DRAM() in here instead of memory_limit because
+ * We use memblock_end_of_DRAM() in here instead of memory_limit because
* we've already adjusted it for the limit and it takes care of
- * having memory holes below the limit.
+ * having memory holes below the limit. Also, in the case of
+ * iommu_is_off, memory_limit is not set but is implicitly enforced.
*/
- if (! memory_limit)
- return size;
-
- if (start + size <= lmb_end_of_DRAM())
+ if (start + size <= memblock_end_of_DRAM())
return size;
- if (start >= lmb_end_of_DRAM())
+ if (start >= memblock_end_of_DRAM())
return 0;
- return lmb_end_of_DRAM() - start;
+ return memblock_end_of_DRAM() - start;
+}
+
+/*
+ * Reads the counter for a given entry in
+ * linux,drconf-usable-memory property
+ */
+static inline int __init read_usm_ranges(const __be32 **usm)
+{
+ /*
+ * For each lmb in ibm,dynamic-memory a corresponding
+ * entry in linux,drconf-usable-memory property contains
+ * a counter followed by that many (base, size) duple.
+ * read the counter from linux,drconf-usable-memory
+ */
+ return read_n_cells(n_mem_size_cells, usm);
}
/*
@@ -353,64 +666,70 @@ static unsigned long __init numa_enforce_memory_limit(unsigned long start,
*/
static void __init parse_drconf_memory(struct device_node *memory)
{
- const unsigned int *lm, *dm, *aa;
- unsigned int ls, ld, la;
- unsigned int n, aam, aalen;
- unsigned long lmb_size, size, start;
- int nid, default_nid = 0;
- unsigned int ai, flags;
-
- lm = of_get_property(memory, "ibm,lmb-size", &ls);
- dm = of_get_property(memory, "ibm,dynamic-memory", &ld);
- aa = of_get_property(memory, "ibm,associativity-lookup-arrays", &la);
- if (!lm || !dm || !aa ||
- ls < sizeof(unsigned int) || ld < sizeof(unsigned int) ||
- la < 2 * sizeof(unsigned int))
+ const __be32 *uninitialized_var(dm), *usm;
+ unsigned int n, rc, ranges, is_kexec_kdump = 0;
+ unsigned long lmb_size, base, size, sz;
+ int nid;
+ struct assoc_arrays aa = { .arrays = NULL };
+
+ n = of_get_drconf_memory(memory, &dm);
+ if (!n)
+ return;
+
+ lmb_size = of_get_lmb_size(memory);
+ if (!lmb_size)
return;
- lmb_size = read_n_cells(n_mem_size_cells, &lm);
- n = *dm++; /* number of LMBs */
- aam = *aa++; /* number of associativity lists */
- aalen = *aa++; /* length of each associativity list */
- if (ld < (n * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int) ||
- la < (aam * aalen + 2) * sizeof(unsigned int))
+ rc = of_get_assoc_arrays(memory, &aa);
+ if (rc)
return;
+ /* check if this is a kexec/kdump kernel */
+ usm = of_get_usable_memory(memory);
+ if (usm != NULL)
+ is_kexec_kdump = 1;
+
for (; n != 0; --n) {
- start = read_n_cells(n_mem_addr_cells, &dm);
- ai = dm[2];
- flags = dm[3];
- dm += 4;
- /* 0x80 == reserved, 0x8 = assigned to us */
- if ((flags & 0x80) || !(flags & 0x8))
- continue;
- nid = default_nid;
- /* flags & 0x40 means associativity index is invalid */
- if (min_common_depth > 0 && min_common_depth <= aalen &&
- (flags & 0x40) == 0 && ai < aam) {
- /* this is like of_node_to_nid_single */
- nid = aa[ai * aalen + min_common_depth - 1];
- if (nid == 0xffff || nid >= MAX_NUMNODES)
- nid = default_nid;
- }
+ struct of_drconf_cell drmem;
- fake_numa_create_new_node(((start + lmb_size) >> PAGE_SHIFT),
- &nid);
- node_set_online(nid);
+ read_drconf_cell(&drmem, &dm);
- size = numa_enforce_memory_limit(start, lmb_size);
- if (!size)
+ /* skip this block if the reserved bit is set in flags (0x80)
+ or if the block is not assigned to this partition (0x8) */
+ if ((drmem.flags & DRCONF_MEM_RESERVED)
+ || !(drmem.flags & DRCONF_MEM_ASSIGNED))
continue;
- add_active_range(nid, start >> PAGE_SHIFT,
- (start >> PAGE_SHIFT) + (size >> PAGE_SHIFT));
+ base = drmem.base_addr;
+ size = lmb_size;
+ ranges = 1;
+
+ if (is_kexec_kdump) {
+ ranges = read_usm_ranges(&usm);
+ if (!ranges) /* there are no (base, size) duple */
+ continue;
+ }
+ do {
+ if (is_kexec_kdump) {
+ base = read_n_cells(n_mem_addr_cells, &usm);
+ size = read_n_cells(n_mem_size_cells, &usm);
+ }
+ nid = of_drconf_to_nid_single(&drmem, &aa);
+ fake_numa_create_new_node(
+ ((base + size) >> PAGE_SHIFT),
+ &nid);
+ node_set_online(nid);
+ sz = numa_enforce_memory_limit(base, size);
+ if (sz)
+ memblock_set_node(base, sz,
+ &memblock.memory, nid);
+ } while (--ranges);
}
}
static int __init parse_numa_properties(void)
{
- struct device_node *cpu = NULL;
- struct device_node *memory = NULL;
+ struct device_node *memory;
int default_nid = 0;
unsigned long i;
@@ -432,9 +751,10 @@ static int __init parse_numa_properties(void)
* each node to be onlined must have NODE_DATA etc backing it.
*/
for_each_present_cpu(i) {
+ struct device_node *cpu;
int nid;
- cpu = find_cpu_node(i);
+ cpu = of_get_cpu_node(i, NULL);
BUG_ON(!cpu);
nid = of_node_to_nid_single(cpu);
of_node_put(cpu);
@@ -450,13 +770,13 @@ static int __init parse_numa_properties(void)
}
get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);
- memory = NULL;
- while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
+
+ for_each_node_by_type(memory, "memory") {
unsigned long start;
unsigned long size;
int nid;
int ranges;
- const unsigned int *memcell_buf;
+ const __be32 *memcell_buf;
unsigned int len;
memcell_buf = of_get_property(memory,
@@ -492,16 +812,16 @@ new_range:
continue;
}
- add_active_range(nid, start >> PAGE_SHIFT,
- (start >> PAGE_SHIFT) + (size >> PAGE_SHIFT));
+ memblock_set_node(start, size, &memblock.memory, nid);
if (--ranges)
goto new_range;
}
/*
- * Now do the same thing for each LMB listed in the ibm,dynamic-memory
- * property in the ibm,dynamic-reconfiguration-memory node.
+ * Now do the same thing for each MEMBLOCK listed in the
+ * ibm,dynamic-memory property in the
+ * ibm,dynamic-reconfiguration-memory node.
*/
memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
if (memory)
@@ -512,22 +832,25 @@ new_range:
static void __init setup_nonnuma(void)
{
- unsigned long top_of_ram = lmb_end_of_DRAM();
- unsigned long total_ram = lmb_phys_mem_size();
+ unsigned long top_of_ram = memblock_end_of_DRAM();
+ unsigned long total_ram = memblock_phys_mem_size();
unsigned long start_pfn, end_pfn;
- unsigned int i, nid = 0;
+ unsigned int nid = 0;
+ struct memblock_region *reg;
printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
top_of_ram, total_ram);
printk(KERN_DEBUG "Memory hole size: %ldMB\n",
(top_of_ram - total_ram) >> 20);
- for (i = 0; i < lmb.memory.cnt; ++i) {
- start_pfn = lmb.memory.region[i].base >> PAGE_SHIFT;
- end_pfn = start_pfn + lmb_size_pages(&lmb.memory, i);
+ for_each_memblock(memory, reg) {
+ start_pfn = memblock_region_memory_base_pfn(reg);
+ end_pfn = memblock_region_memory_end_pfn(reg);
fake_numa_create_new_node(end_pfn, &nid);
- add_active_range(nid, start_pfn, end_pfn);
+ memblock_set_node(PFN_PHYS(start_pfn),
+ PFN_PHYS(end_pfn - start_pfn),
+ &memblock.memory, nid);
node_set_online(nid);
}
}
@@ -548,8 +871,9 @@ void __init dump_numa_cpu_topology(void)
* If we used a CPU iterator here we would miss printing
* the holes in the cpumap.
*/
- for (cpu = 0; cpu < NR_CPUS; cpu++) {
- if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
+ if (cpumask_test_cpu(cpu,
+ node_to_cpumask_map[node])) {
if (count == 0)
printk(" %u", cpu);
++count;
@@ -561,7 +885,7 @@ void __init dump_numa_cpu_topology(void)
}
if (count > 1)
- printk("-%u", NR_CPUS - 1);
+ printk("-%u", nr_cpu_ids - 1);
printk("\n");
}
}
@@ -581,7 +905,7 @@ static void __init dump_numa_memory_topology(void)
count = 0;
- for (i = 0; i < lmb_end_of_DRAM();
+ for (i = 0; i < memblock_end_of_DRAM();
i += (1 << SECTION_SIZE_BITS)) {
if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) {
if (count == 0)
@@ -601,60 +925,134 @@ static void __init dump_numa_memory_topology(void)
}
/*
- * Allocate some memory, satisfying the lmb or bootmem allocator where
+ * Allocate some memory, satisfying the memblock or bootmem allocator where
* required. nid is the preferred node and end is the physical address of
* the highest address in the node.
*
- * Returns the physical address of the memory.
+ * Returns the virtual address of the memory.
*/
-static void __init *careful_allocation(int nid, unsigned long size,
+static void __init *careful_zallocation(int nid, unsigned long size,
unsigned long align,
unsigned long end_pfn)
{
+ void *ret;
int new_nid;
- unsigned long ret = __lmb_alloc_base(size, align, end_pfn << PAGE_SHIFT);
+ unsigned long ret_paddr;
+
+ ret_paddr = __memblock_alloc_base(size, align, end_pfn << PAGE_SHIFT);
/* retry over all memory */
- if (!ret)
- ret = __lmb_alloc_base(size, align, lmb_end_of_DRAM());
+ if (!ret_paddr)
+ ret_paddr = __memblock_alloc_base(size, align, memblock_end_of_DRAM());
- if (!ret)
- panic("numa.c: cannot allocate %lu bytes on node %d",
+ if (!ret_paddr)
+ panic("numa.c: cannot allocate %lu bytes for node %d",
size, nid);
+ ret = __va(ret_paddr);
+
/*
- * If the memory came from a previously allocated node, we must
- * retry with the bootmem allocator.
+ * We initialize the nodes in numeric order: 0, 1, 2...
+ * and hand over control from the MEMBLOCK allocator to the
+ * bootmem allocator. If this function is called for
+ * node 5, then we know that all nodes <5 are using the
+ * bootmem allocator instead of the MEMBLOCK allocator.
+ *
+ * So, check the nid from which this allocation came
+ * and double check to see if we need to use bootmem
+ * instead of the MEMBLOCK. We don't free the MEMBLOCK memory
+ * since it would be useless.
*/
- new_nid = early_pfn_to_nid(ret >> PAGE_SHIFT);
+ new_nid = early_pfn_to_nid(ret_paddr >> PAGE_SHIFT);
if (new_nid < nid) {
- ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(new_nid),
+ ret = __alloc_bootmem_node(NODE_DATA(new_nid),
size, align, 0);
- if (!ret)
- panic("numa.c: cannot allocate %lu bytes on node %d",
- size, new_nid);
-
- ret = __pa(ret);
-
- dbg("alloc_bootmem %lx %lx\n", ret, size);
+ dbg("alloc_bootmem %p %lx\n", ret, size);
}
- return (void *)ret;
+ memset(ret, 0, size);
+ return ret;
}
-static struct notifier_block __cpuinitdata ppc64_numa_nb = {
+static struct notifier_block ppc64_numa_nb = {
.notifier_call = cpu_numa_callback,
.priority = 1 /* Must run before sched domains notifier. */
};
+static void __init mark_reserved_regions_for_nid(int nid)
+{
+ struct pglist_data *node = NODE_DATA(nid);
+ struct memblock_region *reg;
+
+ for_each_memblock(reserved, reg) {
+ unsigned long physbase = reg->base;
+ unsigned long size = reg->size;
+ unsigned long start_pfn = physbase >> PAGE_SHIFT;
+ unsigned long end_pfn = PFN_UP(physbase + size);
+ struct node_active_region node_ar;
+ unsigned long node_end_pfn = pgdat_end_pfn(node);
+
+ /*
+ * Check to make sure that this memblock.reserved area is
+ * within the bounds of the node that we care about.
+ * Checking the nid of the start and end points is not
+ * sufficient because the reserved area could span the
+ * entire node.
+ */
+ if (end_pfn <= node->node_start_pfn ||
+ start_pfn >= node_end_pfn)
+ continue;
+
+ get_node_active_region(start_pfn, &node_ar);
+ while (start_pfn < end_pfn &&
+ node_ar.start_pfn < node_ar.end_pfn) {
+ unsigned long reserve_size = size;
+ /*
+ * if reserved region extends past active region
+ * then trim size to active region
+ */
+ if (end_pfn > node_ar.end_pfn)
+ reserve_size = (node_ar.end_pfn << PAGE_SHIFT)
+ - physbase;
+ /*
+ * Only worry about *this* node, others may not
+ * yet have valid NODE_DATA().
+ */
+ if (node_ar.nid == nid) {
+ dbg("reserve_bootmem %lx %lx nid=%d\n",
+ physbase, reserve_size, node_ar.nid);
+ reserve_bootmem_node(NODE_DATA(node_ar.nid),
+ physbase, reserve_size,
+ BOOTMEM_DEFAULT);
+ }
+ /*
+ * if reserved region is contained in the active region
+ * then done.
+ */
+ if (end_pfn <= node_ar.end_pfn)
+ break;
+
+ /*
+ * reserved region extends past the active region
+ * get next active region that contains this
+ * reserved region
+ */
+ start_pfn = node_ar.end_pfn;
+ physbase = start_pfn << PAGE_SHIFT;
+ size = size - reserve_size;
+ get_node_active_region(start_pfn, &node_ar);
+ }
+ }
+}
+
+
void __init do_init_bootmem(void)
{
int nid;
- unsigned int i;
min_low_pfn = 0;
- max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
+ max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
max_pfn = max_low_pfn;
if (parse_numa_properties())
@@ -662,28 +1060,28 @@ void __init do_init_bootmem(void)
else
dump_numa_memory_topology();
- register_cpu_notifier(&ppc64_numa_nb);
- cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
- (void *)(unsigned long)boot_cpuid);
-
for_each_online_node(nid) {
unsigned long start_pfn, end_pfn;
- unsigned long bootmem_paddr;
+ void *bootmem_vaddr;
unsigned long bootmap_pages;
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
- /* Allocate the node structure node local if possible */
- NODE_DATA(nid) = careful_allocation(nid,
+ /*
+ * Allocate the node structure node local if possible
+ *
+ * Be careful moving this around, as it relies on all
+ * previous nodes' bootmem to be initialized and have
+ * all reserved areas marked.
+ */
+ NODE_DATA(nid) = careful_zallocation(nid,
sizeof(struct pglist_data),
SMP_CACHE_BYTES, end_pfn);
- NODE_DATA(nid) = __va(NODE_DATA(nid));
- memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
dbg("node %d\n", nid);
dbg("NODE_DATA() = %p\n", NODE_DATA(nid));
- NODE_DATA(nid)->bdata = &plat_node_bdata[nid];
+ NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
NODE_DATA(nid)->node_start_pfn = start_pfn;
NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
@@ -694,56 +1092,45 @@ void __init do_init_bootmem(void)
dbg("end_paddr = %lx\n", end_pfn << PAGE_SHIFT);
bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
- bootmem_paddr = (unsigned long)careful_allocation(nid,
+ bootmem_vaddr = careful_zallocation(nid,
bootmap_pages << PAGE_SHIFT,
PAGE_SIZE, end_pfn);
- memset(__va(bootmem_paddr), 0, bootmap_pages << PAGE_SHIFT);
- dbg("bootmap_paddr = %lx\n", bootmem_paddr);
+ dbg("bootmap_vaddr = %p\n", bootmem_vaddr);
- init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT,
+ init_bootmem_node(NODE_DATA(nid),
+ __pa(bootmem_vaddr) >> PAGE_SHIFT,
start_pfn, end_pfn);
free_bootmem_with_active_regions(nid, end_pfn);
+ /*
+ * Be very careful about moving this around. Future
+ * calls to careful_zallocation() depend on this getting
+ * done correctly.
+ */
+ mark_reserved_regions_for_nid(nid);
+ sparse_memory_present_with_active_regions(nid);
+ }
- /* Mark reserved regions on this node */
- for (i = 0; i < lmb.reserved.cnt; i++) {
- unsigned long physbase = lmb.reserved.region[i].base;
- unsigned long size = lmb.reserved.region[i].size;
- unsigned long start_paddr = start_pfn << PAGE_SHIFT;
- unsigned long end_paddr = end_pfn << PAGE_SHIFT;
-
- if (early_pfn_to_nid(physbase >> PAGE_SHIFT) != nid &&
- early_pfn_to_nid((physbase+size-1) >> PAGE_SHIFT) != nid)
- continue;
+ init_bootmem_done = 1;
- if (physbase < end_paddr &&
- (physbase+size) > start_paddr) {
- /* overlaps */
- if (physbase < start_paddr) {
- size -= start_paddr - physbase;
- physbase = start_paddr;
- }
-
- if (size > end_paddr - physbase)
- size = end_paddr - physbase;
-
- dbg("reserve_bootmem %lx %lx\n", physbase,
- size);
- reserve_bootmem_node(NODE_DATA(nid), physbase,
- size, BOOTMEM_DEFAULT);
- }
- }
+ /*
+ * Now bootmem is initialised we can create the node to cpumask
+ * lookup tables and setup the cpu callback to populate them.
+ */
+ setup_node_to_cpumask_map();
- sparse_memory_present_with_active_regions(nid);
- }
+ reset_numa_cpu_lookup_table();
+ register_cpu_notifier(&ppc64_numa_nb);
+ cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
+ (void *)(unsigned long)boot_cpuid);
}
void __init paging_init(void)
{
unsigned long max_zone_pfns[MAX_NR_ZONES];
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
- max_zone_pfns[ZONE_DMA] = lmb_end_of_DRAM() >> PAGE_SHIFT;
+ max_zone_pfns[ZONE_DMA] = memblock_end_of_DRAM() >> PAGE_SHIFT;
free_area_init_nodes(max_zone_pfns);
}
@@ -768,24 +1155,67 @@ early_param("numa", early_numa);
#ifdef CONFIG_MEMORY_HOTPLUG
/*
- * Find the node associated with a hot added memory section. Section
- * corresponds to a SPARSEMEM section, not an LMB. It is assumed that
- * sections are fully contained within a single LMB.
+ * Find the node associated with a hot added memory section for
+ * memory represented in the device tree by the property
+ * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory.
*/
-int hot_add_scn_to_nid(unsigned long scn_addr)
+static int hot_add_drconf_scn_to_nid(struct device_node *memory,
+ unsigned long scn_addr)
{
- struct device_node *memory = NULL;
- nodemask_t nodes;
- int default_nid = any_online_node(NODE_MASK_ALL);
- int nid;
+ const __be32 *dm;
+ unsigned int drconf_cell_cnt, rc;
+ unsigned long lmb_size;
+ struct assoc_arrays aa;
+ int nid = -1;
- if (!numa_enabled || (min_common_depth < 0))
- return default_nid;
+ drconf_cell_cnt = of_get_drconf_memory(memory, &dm);
+ if (!drconf_cell_cnt)
+ return -1;
+
+ lmb_size = of_get_lmb_size(memory);
+ if (!lmb_size)
+ return -1;
+
+ rc = of_get_assoc_arrays(memory, &aa);
+ if (rc)
+ return -1;
- while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
+ for (; drconf_cell_cnt != 0; --drconf_cell_cnt) {
+ struct of_drconf_cell drmem;
+
+ read_drconf_cell(&drmem, &dm);
+
+ /* skip this block if it is reserved or not assigned to
+ * this partition */
+ if ((drmem.flags & DRCONF_MEM_RESERVED)
+ || !(drmem.flags & DRCONF_MEM_ASSIGNED))
+ continue;
+
+ if ((scn_addr < drmem.base_addr)
+ || (scn_addr >= (drmem.base_addr + lmb_size)))
+ continue;
+
+ nid = of_drconf_to_nid_single(&drmem, &aa);
+ break;
+ }
+
+ return nid;
+}
+
+/*
+ * Find the node associated with a hot added memory section for memory
+ * represented in the device tree as a node (i.e. memory@XXXX) for
+ * each memblock.
+ */
+static int hot_add_node_scn_to_nid(unsigned long scn_addr)
+{
+ struct device_node *memory;
+ int nid = -1;
+
+ for_each_node_by_type(memory, "memory") {
unsigned long start, size;
int ranges;
- const unsigned int *memcell_buf;
+ const __be32 *memcell_buf;
unsigned int len;
memcell_buf = of_get_property(memory, "reg", &len);
@@ -794,33 +1224,586 @@ int hot_add_scn_to_nid(unsigned long scn_addr)
/* ranges in cell */
ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
-ha_new_range:
- start = read_n_cells(n_mem_addr_cells, &memcell_buf);
- size = read_n_cells(n_mem_size_cells, &memcell_buf);
- nid = of_node_to_nid_single(memory);
- /* Domains not present at boot default to 0 */
- if (nid < 0 || !node_online(nid))
- nid = default_nid;
+ while (ranges--) {
+ start = read_n_cells(n_mem_addr_cells, &memcell_buf);
+ size = read_n_cells(n_mem_size_cells, &memcell_buf);
+
+ if ((scn_addr < start) || (scn_addr >= (start + size)))
+ continue;
- if ((scn_addr >= start) && (scn_addr < (start + size))) {
- of_node_put(memory);
- goto got_nid;
+ nid = of_node_to_nid_single(memory);
+ break;
}
- if (--ranges) /* process all ranges in cell */
- goto ha_new_range;
+ if (nid >= 0)
+ break;
}
- BUG(); /* section address should be found above */
- return 0;
- /* Temporary code to ensure that returned node is not empty */
-got_nid:
- nodes_setall(nodes);
- while (NODE_DATA(nid)->node_spanned_pages == 0) {
- node_clear(nid, nodes);
- nid = any_online_node(nodes);
+ of_node_put(memory);
+
+ return nid;
+}
+
+/*
+ * Find the node associated with a hot added memory section. Section
+ * corresponds to a SPARSEMEM section, not an MEMBLOCK. It is assumed that
+ * sections are fully contained within a single MEMBLOCK.
+ */
+int hot_add_scn_to_nid(unsigned long scn_addr)
+{
+ struct device_node *memory = NULL;
+ int nid, found = 0;
+
+ if (!numa_enabled || (min_common_depth < 0))
+ return first_online_node;
+
+ memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
+ if (memory) {
+ nid = hot_add_drconf_scn_to_nid(memory, scn_addr);
+ of_node_put(memory);
+ } else {
+ nid = hot_add_node_scn_to_nid(scn_addr);
}
+
+ if (nid < 0 || !node_online(nid))
+ nid = first_online_node;
+
+ if (NODE_DATA(nid)->node_spanned_pages)
+ return nid;
+
+ for_each_online_node(nid) {
+ if (NODE_DATA(nid)->node_spanned_pages) {
+ found = 1;
+ break;
+ }
+ }
+
+ BUG_ON(!found);
return nid;
}
+
+static u64 hot_add_drconf_memory_max(void)
+{
+ struct device_node *memory = NULL;
+ unsigned int drconf_cell_cnt = 0;
+ u64 lmb_size = 0;
+ const __be32 *dm = NULL;
+
+ memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
+ if (memory) {
+ drconf_cell_cnt = of_get_drconf_memory(memory, &dm);
+ lmb_size = of_get_lmb_size(memory);
+ of_node_put(memory);
+ }
+ return lmb_size * drconf_cell_cnt;
+}
+
+/*
+ * memory_hotplug_max - return max address of memory that may be added
+ *
+ * This is currently only used on systems that support drconfig memory
+ * hotplug.
+ */
+u64 memory_hotplug_max(void)
+{
+ return max(hot_add_drconf_memory_max(), memblock_end_of_DRAM());
+}
#endif /* CONFIG_MEMORY_HOTPLUG */
+
+/* Virtual Processor Home Node (VPHN) support */
+#ifdef CONFIG_PPC_SPLPAR
+struct topology_update_data {
+ struct topology_update_data *next;
+ unsigned int cpu;
+ int old_nid;
+ int new_nid;
+};
+
+static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
+static cpumask_t cpu_associativity_changes_mask;
+static int vphn_enabled;
+static int prrn_enabled;
+static void reset_topology_timer(void);
+
+/*
+ * Store the current values of the associativity change counters in the
+ * hypervisor.
+ */
+static void setup_cpu_associativity_change_counters(void)
+{
+ int cpu;
+
+ /* The VPHN feature supports a maximum of 8 reference points */
+ BUILD_BUG_ON(MAX_DISTANCE_REF_POINTS > 8);
+
+ for_each_possible_cpu(cpu) {
+ int i;
+ u8 *counts = vphn_cpu_change_counts[cpu];
+ volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts;
+
+ for (i = 0; i < distance_ref_points_depth; i++)
+ counts[i] = hypervisor_counts[i];
+ }
+}
+
+/*
+ * The hypervisor maintains a set of 8 associativity change counters in
+ * the VPA of each cpu that correspond to the associativity levels in the
+ * ibm,associativity-reference-points property. When an associativity
+ * level changes, the corresponding counter is incremented.
+ *
+ * Set a bit in cpu_associativity_changes_mask for each cpu whose home
+ * node associativity levels have changed.
+ *
+ * Returns the number of cpus with unhandled associativity changes.
+ */
+static int update_cpu_associativity_changes_mask(void)
+{
+ int cpu;
+ cpumask_t *changes = &cpu_associativity_changes_mask;
+
+ for_each_possible_cpu(cpu) {
+ int i, changed = 0;
+ u8 *counts = vphn_cpu_change_counts[cpu];
+ volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts;
+
+ for (i = 0; i < distance_ref_points_depth; i++) {
+ if (hypervisor_counts[i] != counts[i]) {
+ counts[i] = hypervisor_counts[i];
+ changed = 1;
+ }
+ }
+ if (changed) {
+ cpumask_or(changes, changes, cpu_sibling_mask(cpu));
+ cpu = cpu_last_thread_sibling(cpu);
+ }
+ }
+
+ return cpumask_weight(changes);
+}
+
+/*
+ * 6 64-bit registers unpacked into 12 32-bit associativity values. To form
+ * the complete property we have to add the length in the first cell.
+ */
+#define VPHN_ASSOC_BUFSIZE (6*sizeof(u64)/sizeof(u32) + 1)
+
+/*
+ * Convert the associativity domain numbers returned from the hypervisor
+ * to the sequence they would appear in the ibm,associativity property.
+ */
+static int vphn_unpack_associativity(const long *packed, __be32 *unpacked)
+{
+ int i, nr_assoc_doms = 0;
+ const __be16 *field = (const __be16 *) packed;
+
+#define VPHN_FIELD_UNUSED (0xffff)
+#define VPHN_FIELD_MSB (0x8000)
+#define VPHN_FIELD_MASK (~VPHN_FIELD_MSB)
+
+ for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) {
+ if (be16_to_cpup(field) == VPHN_FIELD_UNUSED) {
+ /* All significant fields processed, and remaining
+ * fields contain the reserved value of all 1's.
+ * Just store them.
+ */
+ unpacked[i] = *((__be32 *)field);
+ field += 2;
+ } else if (be16_to_cpup(field) & VPHN_FIELD_MSB) {
+ /* Data is in the lower 15 bits of this field */
+ unpacked[i] = cpu_to_be32(
+ be16_to_cpup(field) & VPHN_FIELD_MASK);
+ field++;
+ nr_assoc_doms++;
+ } else {
+ /* Data is in the lower 15 bits of this field
+ * concatenated with the next 16 bit field
+ */
+ unpacked[i] = *((__be32 *)field);
+ field += 2;
+ nr_assoc_doms++;
+ }
+ }
+
+ /* The first cell contains the length of the property */
+ unpacked[0] = cpu_to_be32(nr_assoc_doms);
+
+ return nr_assoc_doms;
+}
+
+/*
+ * Retrieve the new associativity information for a virtual processor's
+ * home node.
+ */
+static long hcall_vphn(unsigned long cpu, __be32 *associativity)
+{
+ long rc;
+ long retbuf[PLPAR_HCALL9_BUFSIZE] = {0};
+ u64 flags = 1;
+ int hwcpu = get_hard_smp_processor_id(cpu);
+
+ rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, hwcpu);
+ vphn_unpack_associativity(retbuf, associativity);
+
+ return rc;
+}
+
+static long vphn_get_associativity(unsigned long cpu,
+ __be32 *associativity)
+{
+ long rc;
+
+ rc = hcall_vphn(cpu, associativity);
+
+ switch (rc) {
+ case H_FUNCTION:
+ printk(KERN_INFO
+ "VPHN is not supported. Disabling polling...\n");
+ stop_topology_update();
+ break;
+ case H_HARDWARE:
+ printk(KERN_ERR
+ "hcall_vphn() experienced a hardware fault "
+ "preventing VPHN. Disabling polling...\n");
+ stop_topology_update();
+ }
+
+ return rc;
+}
+
+/*
+ * Update the CPU maps and sysfs entries for a single CPU when its NUMA
+ * characteristics change. This function doesn't perform any locking and is
+ * only safe to call from stop_machine().
+ */
+static int update_cpu_topology(void *data)
+{
+ struct topology_update_data *update;
+ unsigned long cpu;
+
+ if (!data)
+ return -EINVAL;
+
+ cpu = smp_processor_id();
+
+ for (update = data; update; update = update->next) {
+ if (cpu != update->cpu)
+ continue;
+
+ unmap_cpu_from_node(update->cpu);
+ map_cpu_to_node(update->cpu, update->new_nid);
+ vdso_getcpu_init();
+ }
+
+ return 0;
+}
+
+static int update_lookup_table(void *data)
+{
+ struct topology_update_data *update;
+
+ if (!data)
+ return -EINVAL;
+
+ /*
+ * Upon topology update, the numa-cpu lookup table needs to be updated
+ * for all threads in the core, including offline CPUs, to ensure that
+ * future hotplug operations respect the cpu-to-node associativity
+ * properly.
+ */
+ for (update = data; update; update = update->next) {
+ int nid, base, j;
+
+ nid = update->new_nid;
+ base = cpu_first_thread_sibling(update->cpu);
+
+ for (j = 0; j < threads_per_core; j++) {
+ update_numa_cpu_lookup_table(base + j, nid);
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Update the node maps and sysfs entries for each cpu whose home node
+ * has changed. Returns 1 when the topology has changed, and 0 otherwise.
+ */
+int arch_update_cpu_topology(void)
+{
+ unsigned int cpu, sibling, changed = 0;
+ struct topology_update_data *updates, *ud;
+ __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
+ cpumask_t updated_cpus;
+ struct device *dev;
+ int weight, new_nid, i = 0;
+
+ weight = cpumask_weight(&cpu_associativity_changes_mask);
+ if (!weight)
+ return 0;
+
+ updates = kzalloc(weight * (sizeof(*updates)), GFP_KERNEL);
+ if (!updates)
+ return 0;
+
+ cpumask_clear(&updated_cpus);
+
+ for_each_cpu(cpu, &cpu_associativity_changes_mask) {
+ /*
+ * If siblings aren't flagged for changes, updates list
+ * will be too short. Skip on this update and set for next
+ * update.
+ */
+ if (!cpumask_subset(cpu_sibling_mask(cpu),
+ &cpu_associativity_changes_mask)) {
+ pr_info("Sibling bits not set for associativity "
+ "change, cpu%d\n", cpu);
+ cpumask_or(&cpu_associativity_changes_mask,
+ &cpu_associativity_changes_mask,
+ cpu_sibling_mask(cpu));
+ cpu = cpu_last_thread_sibling(cpu);
+ continue;
+ }
+
+ /* Use associativity from first thread for all siblings */
+ vphn_get_associativity(cpu, associativity);
+ new_nid = associativity_to_nid(associativity);
+ if (new_nid < 0 || !node_online(new_nid))
+ new_nid = first_online_node;
+
+ if (new_nid == numa_cpu_lookup_table[cpu]) {
+ cpumask_andnot(&cpu_associativity_changes_mask,
+ &cpu_associativity_changes_mask,
+ cpu_sibling_mask(cpu));
+ cpu = cpu_last_thread_sibling(cpu);
+ continue;
+ }
+
+ for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
+ ud = &updates[i++];
+ ud->cpu = sibling;
+ ud->new_nid = new_nid;
+ ud->old_nid = numa_cpu_lookup_table[sibling];
+ cpumask_set_cpu(sibling, &updated_cpus);
+ if (i < weight)
+ ud->next = &updates[i];
+ }
+ cpu = cpu_last_thread_sibling(cpu);
+ }
+
+ /*
+ * In cases where we have nothing to update (because the updates list
+ * is too short or because the new topology is same as the old one),
+ * skip invoking update_cpu_topology() via stop-machine(). This is
+ * necessary (and not just a fast-path optimization) since stop-machine
+ * can end up electing a random CPU to run update_cpu_topology(), and
+ * thus trick us into setting up incorrect cpu-node mappings (since
+ * 'updates' is kzalloc()'ed).
+ *
+ * And for the similar reason, we will skip all the following updating.
+ */
+ if (!cpumask_weight(&updated_cpus))
+ goto out;
+
+ stop_machine(update_cpu_topology, &updates[0], &updated_cpus);
+
+ /*
+ * Update the numa-cpu lookup table with the new mappings, even for
+ * offline CPUs. It is best to perform this update from the stop-
+ * machine context.
+ */
+ stop_machine(update_lookup_table, &updates[0],
+ cpumask_of(raw_smp_processor_id()));
+
+ for (ud = &updates[0]; ud; ud = ud->next) {
+ unregister_cpu_under_node(ud->cpu, ud->old_nid);
+ register_cpu_under_node(ud->cpu, ud->new_nid);
+
+ dev = get_cpu_device(ud->cpu);
+ if (dev)
+ kobject_uevent(&dev->kobj, KOBJ_CHANGE);
+ cpumask_clear_cpu(ud->cpu, &cpu_associativity_changes_mask);
+ changed = 1;
+ }
+
+out:
+ kfree(updates);
+ return changed;
+}
+
+static void topology_work_fn(struct work_struct *work)
+{
+ rebuild_sched_domains();
+}
+static DECLARE_WORK(topology_work, topology_work_fn);
+
+static void topology_schedule_update(void)
+{
+ schedule_work(&topology_work);
+}
+
+static void topology_timer_fn(unsigned long ignored)
+{
+ if (prrn_enabled && cpumask_weight(&cpu_associativity_changes_mask))
+ topology_schedule_update();
+ else if (vphn_enabled) {
+ if (update_cpu_associativity_changes_mask() > 0)
+ topology_schedule_update();
+ reset_topology_timer();
+ }
+}
+static struct timer_list topology_timer =
+ TIMER_INITIALIZER(topology_timer_fn, 0, 0);
+
+static void reset_topology_timer(void)
+{
+ topology_timer.data = 0;
+ topology_timer.expires = jiffies + 60 * HZ;
+ mod_timer(&topology_timer, topology_timer.expires);
+}
+
+#ifdef CONFIG_SMP
+
+static void stage_topology_update(int core_id)
+{
+ cpumask_or(&cpu_associativity_changes_mask,
+ &cpu_associativity_changes_mask, cpu_sibling_mask(core_id));
+ reset_topology_timer();
+}
+
+static int dt_update_callback(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct of_prop_reconfig *update;
+ int rc = NOTIFY_DONE;
+
+ switch (action) {
+ case OF_RECONFIG_UPDATE_PROPERTY:
+ update = (struct of_prop_reconfig *)data;
+ if (!of_prop_cmp(update->dn->type, "cpu") &&
+ !of_prop_cmp(update->prop->name, "ibm,associativity")) {
+ u32 core_id;
+ of_property_read_u32(update->dn, "reg", &core_id);
+ stage_topology_update(core_id);
+ rc = NOTIFY_OK;
+ }
+ break;
+ }
+
+ return rc;
+}
+
+static struct notifier_block dt_update_nb = {
+ .notifier_call = dt_update_callback,
+};
+
+#endif
+
+/*
+ * Start polling for associativity changes.
+ */
+int start_topology_update(void)
+{
+ int rc = 0;
+
+ if (firmware_has_feature(FW_FEATURE_PRRN)) {
+ if (!prrn_enabled) {
+ prrn_enabled = 1;
+ vphn_enabled = 0;
+#ifdef CONFIG_SMP
+ rc = of_reconfig_notifier_register(&dt_update_nb);
+#endif
+ }
+ } else if (firmware_has_feature(FW_FEATURE_VPHN) &&
+ lppaca_shared_proc(get_lppaca())) {
+ if (!vphn_enabled) {
+ prrn_enabled = 0;
+ vphn_enabled = 1;
+ setup_cpu_associativity_change_counters();
+ init_timer_deferrable(&topology_timer);
+ reset_topology_timer();
+ }
+ }
+
+ return rc;
+}
+
+/*
+ * Disable polling for VPHN associativity changes.
+ */
+int stop_topology_update(void)
+{
+ int rc = 0;
+
+ if (prrn_enabled) {
+ prrn_enabled = 0;
+#ifdef CONFIG_SMP
+ rc = of_reconfig_notifier_unregister(&dt_update_nb);
+#endif
+ } else if (vphn_enabled) {
+ vphn_enabled = 0;
+ rc = del_timer_sync(&topology_timer);
+ }
+
+ return rc;
+}
+
+int prrn_is_enabled(void)
+{
+ return prrn_enabled;
+}
+
+static int topology_read(struct seq_file *file, void *v)
+{
+ if (vphn_enabled || prrn_enabled)
+ seq_puts(file, "on\n");
+ else
+ seq_puts(file, "off\n");
+
+ return 0;
+}
+
+static int topology_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, topology_read, NULL);
+}
+
+static ssize_t topology_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *off)
+{
+ char kbuf[4]; /* "on" or "off" plus null. */
+ int read_len;
+
+ read_len = count < 3 ? count : 3;
+ if (copy_from_user(kbuf, buf, read_len))
+ return -EINVAL;
+
+ kbuf[read_len] = '\0';
+
+ if (!strncmp(kbuf, "on", 2))
+ start_topology_update();
+ else if (!strncmp(kbuf, "off", 3))
+ stop_topology_update();
+ else
+ return -EINVAL;
+
+ return count;
+}
+
+static const struct file_operations topology_ops = {
+ .read = seq_read,
+ .write = topology_write,
+ .open = topology_open,
+ .release = single_release
+};
+
+static int topology_update_init(void)
+{
+ start_topology_update();
+ proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops);
+
+ return 0;
+}
+device_initcall(topology_update_init);
+#endif /* CONFIG_PPC_SPLPAR */
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
new file mode 100644
index 00000000000..c695943a513
--- /dev/null
+++ b/arch/powerpc/mm/pgtable.c
@@ -0,0 +1,236 @@
+/*
+ * This file contains common routines for dealing with free of page tables
+ * Along with common page table handling code
+ *
+ * Derived from arch/powerpc/mm/tlb_64.c:
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ * and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ * Copyright (C) 1996 Paul Mackerras
+ *
+ * Derived from "arch/i386/mm/init.c"
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * Dave Engebretsen <engebret@us.ibm.com>
+ * Rework for PPC64 port.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/hugetlb.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+
+static inline int is_exec_fault(void)
+{
+ return current->thread.regs && TRAP(current->thread.regs) == 0x400;
+}
+
+/* We only try to do i/d cache coherency on stuff that looks like
+ * reasonably "normal" PTEs. We currently require a PTE to be present
+ * and we avoid _PAGE_SPECIAL and _PAGE_NO_CACHE. We also only do that
+ * on userspace PTEs
+ */
+static inline int pte_looks_normal(pte_t pte)
+{
+ return (pte_val(pte) &
+ (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER)) ==
+ (_PAGE_PRESENT | _PAGE_USER);
+}
+
+struct page * maybe_pte_to_page(pte_t pte)
+{
+ unsigned long pfn = pte_pfn(pte);
+ struct page *page;
+
+ if (unlikely(!pfn_valid(pfn)))
+ return NULL;
+ page = pfn_to_page(pfn);
+ if (PageReserved(page))
+ return NULL;
+ return page;
+}
+
+#if defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0
+
+/* Server-style MMU handles coherency when hashing if HW exec permission
+ * is supposed per page (currently 64-bit only). If not, then, we always
+ * flush the cache for valid PTEs in set_pte. Embedded CPU without HW exec
+ * support falls into the same category.
+ */
+
+static pte_t set_pte_filter(pte_t pte)
+{
+ pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
+ if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) ||
+ cpu_has_feature(CPU_FTR_NOEXECUTE))) {
+ struct page *pg = maybe_pte_to_page(pte);
+ if (!pg)
+ return pte;
+ if (!test_bit(PG_arch_1, &pg->flags)) {
+ flush_dcache_icache_page(pg);
+ set_bit(PG_arch_1, &pg->flags);
+ }
+ }
+ return pte;
+}
+
+static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
+ int dirty)
+{
+ return pte;
+}
+
+#else /* defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0 */
+
+/* Embedded type MMU with HW exec support. This is a bit more complicated
+ * as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC so
+ * instead we "filter out" the exec permission for non clean pages.
+ */
+static pte_t set_pte_filter(pte_t pte)
+{
+ struct page *pg;
+
+ /* No exec permission in the first place, move on */
+ if (!(pte_val(pte) & _PAGE_EXEC) || !pte_looks_normal(pte))
+ return pte;
+
+ /* If you set _PAGE_EXEC on weird pages you're on your own */
+ pg = maybe_pte_to_page(pte);
+ if (unlikely(!pg))
+ return pte;
+
+ /* If the page clean, we move on */
+ if (test_bit(PG_arch_1, &pg->flags))
+ return pte;
+
+ /* If it's an exec fault, we flush the cache and make it clean */
+ if (is_exec_fault()) {
+ flush_dcache_icache_page(pg);
+ set_bit(PG_arch_1, &pg->flags);
+ return pte;
+ }
+
+ /* Else, we filter out _PAGE_EXEC */
+ return __pte(pte_val(pte) & ~_PAGE_EXEC);
+}
+
+static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
+ int dirty)
+{
+ struct page *pg;
+
+ /* So here, we only care about exec faults, as we use them
+ * to recover lost _PAGE_EXEC and perform I$/D$ coherency
+ * if necessary. Also if _PAGE_EXEC is already set, same deal,
+ * we just bail out
+ */
+ if (dirty || (pte_val(pte) & _PAGE_EXEC) || !is_exec_fault())
+ return pte;
+
+#ifdef CONFIG_DEBUG_VM
+ /* So this is an exec fault, _PAGE_EXEC is not set. If it was
+ * an error we would have bailed out earlier in do_page_fault()
+ * but let's make sure of it
+ */
+ if (WARN_ON(!(vma->vm_flags & VM_EXEC)))
+ return pte;
+#endif /* CONFIG_DEBUG_VM */
+
+ /* If you set _PAGE_EXEC on weird pages you're on your own */
+ pg = maybe_pte_to_page(pte);
+ if (unlikely(!pg))
+ goto bail;
+
+ /* If the page is already clean, we move on */
+ if (test_bit(PG_arch_1, &pg->flags))
+ goto bail;
+
+ /* Clean the page and set PG_arch_1 */
+ flush_dcache_icache_page(pg);
+ set_bit(PG_arch_1, &pg->flags);
+
+ bail:
+ return __pte(pte_val(pte) | _PAGE_EXEC);
+}
+
+#endif /* !(defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0) */
+
+/*
+ * set_pte stores a linux PTE into the linux page table.
+ */
+void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
+ pte_t pte)
+{
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(pte_val(*ptep) & _PAGE_PRESENT);
+#endif
+ /* Note: mm->context.id might not yet have been assigned as
+ * this context might not have been activated yet when this
+ * is called.
+ */
+ pte = set_pte_filter(pte);
+
+ /* Perform the setting of the PTE */
+ __set_pte_at(mm, addr, ptep, pte, 0);
+}
+
+/*
+ * This is called when relaxing access to a PTE. It's also called in the page
+ * fault path when we don't hit any of the major fault cases, ie, a minor
+ * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have
+ * handled those two for us, we additionally deal with missing execute
+ * permission here on some processors
+ */
+int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
+ pte_t *ptep, pte_t entry, int dirty)
+{
+ int changed;
+ entry = set_access_flags_filter(entry, vma, dirty);
+ changed = !pte_same(*(ptep), entry);
+ if (changed) {
+ if (!is_vm_hugetlb_page(vma))
+ assert_pte_locked(vma->vm_mm, address);
+ __ptep_set_access_flags(ptep, entry);
+ flush_tlb_page_nohash(vma, address);
+ }
+ return changed;
+}
+
+#ifdef CONFIG_DEBUG_VM
+void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ if (mm == &init_mm)
+ return;
+ pgd = mm->pgd + pgd_index(addr);
+ BUG_ON(pgd_none(*pgd));
+ pud = pud_offset(pgd, addr);
+ BUG_ON(pud_none(*pud));
+ pmd = pmd_offset(pud, addr);
+ /*
+ * khugepaged to collapse normal pages to hugepage, first set
+ * pmd to none to force page fault/gup to take mmap_sem. After
+ * pmd is set to none, we do a pte_clear which does this assertion
+ * so if we find pmd none, return.
+ */
+ if (pmd_none(*pmd))
+ return;
+ BUG_ON(!pmd_present(*pmd));
+ assert_spin_locked(pte_lockptr(mm, pmd));
+}
+#endif /* CONFIG_DEBUG_VM */
+
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index ac3390f8190..343a87fa78b 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -26,10 +26,14 @@
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/highmem.h>
+#include <linux/memblock.h>
+#include <linux/slab.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
+#include <asm/fixmap.h>
#include <asm/io.h>
+#include <asm/setup.h>
#include "mmu_decl.h"
@@ -47,14 +51,10 @@ EXPORT_SYMBOL(ioremap_bot); /* aka VMALLOC_END */
extern char etext[], _stext[];
-#ifdef CONFIG_SMP
-extern void hash_page_sync(void);
-#endif
-
#ifdef HAVE_BATS
-extern unsigned long v_mapped_by_bats(unsigned long va);
-extern unsigned long p_mapped_by_bats(unsigned long pa);
-void setbat(int index, unsigned long virt, unsigned long phys,
+extern phys_addr_t v_mapped_by_bats(unsigned long va);
+extern unsigned long p_mapped_by_bats(phys_addr_t pa);
+void setbat(int index, unsigned long virt, phys_addr_t phys,
unsigned int size, int flags);
#else /* !HAVE_BATS */
@@ -64,31 +64,36 @@ void setbat(int index, unsigned long virt, unsigned long phys,
#ifdef HAVE_TLBCAM
extern unsigned int tlbcam_index;
-extern unsigned long v_mapped_by_tlbcam(unsigned long va);
-extern unsigned long p_mapped_by_tlbcam(unsigned long pa);
+extern phys_addr_t v_mapped_by_tlbcam(unsigned long va);
+extern unsigned long p_mapped_by_tlbcam(phys_addr_t pa);
#else /* !HAVE_TLBCAM */
#define v_mapped_by_tlbcam(x) (0UL)
#define p_mapped_by_tlbcam(x) (0UL)
#endif /* HAVE_TLBCAM */
-#ifdef CONFIG_PTE_64BIT
-/* 44x uses an 8kB pgdir because it has 8-byte Linux PTEs. */
-#define PGDIR_ORDER 1
-#else
-#define PGDIR_ORDER 0
-#endif
+#define PGDIR_ORDER (32 + PGD_T_LOG2 - PGDIR_SHIFT)
pgd_t *pgd_alloc(struct mm_struct *mm)
{
pgd_t *ret;
- ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, PGDIR_ORDER);
+ /* pgdir take page or two with 4K pages and a page fraction otherwise */
+#ifndef CONFIG_PPC_4K_PAGES
+ ret = kzalloc(1 << PGDIR_ORDER, GFP_KERNEL);
+#else
+ ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO,
+ PGDIR_ORDER - PAGE_SHIFT);
+#endif
return ret;
}
void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
- free_pages((unsigned long)pgd, PGDIR_ORDER);
+#ifndef CONFIG_PPC_4K_PAGES
+ kfree((void *)pgd);
+#else
+ free_pages((unsigned long)pgd, PGDIR_ORDER - PAGE_SHIFT);
+#endif
}
__init_refok pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
@@ -111,57 +116,78 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
struct page *ptepage;
-#ifdef CONFIG_HIGHPTE
- gfp_t flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_REPEAT | __GFP_ZERO;
-#else
gfp_t flags = GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO;
-#endif
ptepage = alloc_pages(flags, 0);
if (!ptepage)
return NULL;
- pgtable_page_ctor(ptepage);
+ if (!pgtable_page_ctor(ptepage)) {
+ __free_page(ptepage);
+ return NULL;
+ }
return ptepage;
}
-void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+void __iomem *
+ioremap(phys_addr_t addr, unsigned long size)
{
-#ifdef CONFIG_SMP
- hash_page_sync();
-#endif
- free_page((unsigned long)pte);
+ return __ioremap_caller(addr, size, _PAGE_NO_CACHE | _PAGE_GUARDED,
+ __builtin_return_address(0));
}
+EXPORT_SYMBOL(ioremap);
-void pte_free(struct mm_struct *mm, pgtable_t ptepage)
+void __iomem *
+ioremap_wc(phys_addr_t addr, unsigned long size)
{
-#ifdef CONFIG_SMP
- hash_page_sync();
-#endif
- pgtable_page_dtor(ptepage);
- __free_page(ptepage);
+ return __ioremap_caller(addr, size, _PAGE_NO_CACHE,
+ __builtin_return_address(0));
}
+EXPORT_SYMBOL(ioremap_wc);
void __iomem *
-ioremap(phys_addr_t addr, unsigned long size)
+ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags)
{
- return __ioremap(addr, size, _PAGE_NO_CACHE);
+ /* writeable implies dirty for kernel addresses */
+ if (flags & _PAGE_RW)
+ flags |= _PAGE_DIRTY | _PAGE_HWWRITE;
+
+ /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
+ flags &= ~(_PAGE_USER | _PAGE_EXEC);
+
+#ifdef _PAGE_BAP_SR
+ /* _PAGE_USER contains _PAGE_BAP_SR on BookE using the new PTE format
+ * which means that we just cleared supervisor access... oops ;-) This
+ * restores it
+ */
+ flags |= _PAGE_BAP_SR;
+#endif
+
+ return __ioremap_caller(addr, size, flags, __builtin_return_address(0));
}
-EXPORT_SYMBOL(ioremap);
+EXPORT_SYMBOL(ioremap_prot);
void __iomem *
-ioremap_flags(phys_addr_t addr, unsigned long size, unsigned long flags)
+__ioremap(phys_addr_t addr, unsigned long size, unsigned long flags)
{
- return __ioremap(addr, size, flags);
+ return __ioremap_caller(addr, size, flags, __builtin_return_address(0));
}
-EXPORT_SYMBOL(ioremap_flags);
void __iomem *
-__ioremap(phys_addr_t addr, unsigned long size, unsigned long flags)
+__ioremap_caller(phys_addr_t addr, unsigned long size, unsigned long flags,
+ void *caller)
{
unsigned long v, i;
phys_addr_t p;
int err;
+ /* Make sure we have the base flags */
+ if ((flags & _PAGE_PRESENT) == 0)
+ flags |= PAGE_KERNEL;
+
+ /* Non-cacheable page cannot be coherent */
+ if (flags & _PAGE_NO_CACHE)
+ flags &= ~_PAGE_COHERENT;
+
/*
* Choose an address to map it to.
* Once the vmalloc system is running, we use it.
@@ -178,15 +204,18 @@ __ioremap(phys_addr_t addr, unsigned long size, unsigned long flags)
if (p < 16*1024*1024)
p += _ISA_MEM_BASE;
+#ifndef CONFIG_CRASH_DUMP
/*
* Don't allow anybody to remap normal RAM that we're using.
* mem_init() sets high_memory so only do the check after that.
*/
- if (mem_init_done && (p < virt_to_phys(high_memory))) {
- printk("__ioremap(): phys addr 0x%llx is RAM lr %p\n",
+ if (mem_init_done && (p < virt_to_phys(high_memory)) &&
+ !(__allow_ioremap_reserved && memblock_is_region_reserved(p, size))) {
+ printk("__ioremap(): phys addr 0x%llx is RAM lr %pf\n",
(unsigned long long)p, __builtin_return_address(0));
return NULL;
}
+#endif
if (size == 0)
return NULL;
@@ -210,19 +239,15 @@ __ioremap(phys_addr_t addr, unsigned long size, unsigned long flags)
if (mem_init_done) {
struct vm_struct *area;
- area = get_vm_area(size, VM_IOREMAP);
+ area = get_vm_area_caller(size, VM_IOREMAP, caller);
if (area == 0)
return NULL;
+ area->phys_addr = p;
v = (unsigned long) area->addr;
} else {
v = (ioremap_bot -= size);
}
- if ((flags & _PAGE_PRESENT) == 0)
- flags |= _PAGE_KERNEL;
- if (flags & _PAGE_NO_CACHE)
- flags |= _PAGE_GUARDED;
-
/*
* Should check if it is a candidate for a BAT mapping
*/
@@ -269,27 +294,30 @@ int map_page(unsigned long va, phys_addr_t pa, int flags)
/* The PTE should never be already set nor present in the
* hash table
*/
- BUG_ON(pte_val(*pg) & (_PAGE_PRESENT | _PAGE_HASHPTE));
+ BUG_ON((pte_val(*pg) & (_PAGE_PRESENT | _PAGE_HASHPTE)) &&
+ flags);
set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT,
__pgprot(flags)));
}
+ smp_wmb();
return err;
}
/*
- * Map in all of physical memory starting at KERNELBASE.
+ * Map in a chunk of physical memory starting at start.
*/
-void __init mapin_ram(void)
+void __init __mapin_ram_chunk(unsigned long offset, unsigned long top)
{
- unsigned long v, p, s, f;
+ unsigned long v, s, f;
+ phys_addr_t p;
int ktext;
- s = mmu_mapin_ram();
- v = KERNELBASE + s;
- p = PPC_MEMSTART + s;
- for (; s < total_lowmem; s += PAGE_SIZE) {
+ s = offset;
+ v = PAGE_OFFSET + s;
+ p = memstart_addr + s;
+ for (; s < top; s += PAGE_SIZE) {
ktext = ((char *) v >= _stext && (char *) v < etext);
- f = ktext ?_PAGE_RAM_TEXT : _PAGE_RAM;
+ f = ktext ? PAGE_KERNEL_TEXT : PAGE_KERNEL;
map_page(v, p, f);
#ifdef CONFIG_PPC_STD_MMU_32
if (ktext)
@@ -300,6 +328,30 @@ void __init mapin_ram(void)
}
}
+void __init mapin_ram(void)
+{
+ unsigned long s, top;
+
+#ifndef CONFIG_WII
+ top = total_lowmem;
+ s = mmu_mapin_ram(top);
+ __mapin_ram_chunk(s, top);
+#else
+ if (!wii_hole_size) {
+ s = mmu_mapin_ram(total_lowmem);
+ __mapin_ram_chunk(s, total_lowmem);
+ } else {
+ top = wii_hole_start;
+ s = mmu_mapin_ram(top);
+ __mapin_ram_chunk(s, top);
+
+ top = memblock_end_of_DRAM();
+ s = wii_mmu_mapin_mem2(top);
+ __mapin_ram_chunk(s, top);
+ }
+#endif
+}
+
/* Scan the real Linux page tables and return a PTE pointer for
* a virtual address in a context.
* Returns true (1) if PTE was found, zero otherwise. The pointer to
@@ -349,9 +401,9 @@ static int __change_page_attr(struct page *page, pgprot_t prot)
return 0;
if (!get_pteptr(&init_mm, address, &kpte, &kpmd))
return -EINVAL;
- set_pte_at(&init_mm, address, kpte, mk_pte(page, prot));
+ __set_pte_at(&init_mm, address, kpte, mk_pte(page, prot), 0);
wmb();
- flush_HPTE(0, address, pmd_val(*kpmd));
+ flush_tlb_page(NULL, address);
pte_unmap(kpte);
return 0;
@@ -386,3 +438,23 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0));
}
#endif /* CONFIG_DEBUG_PAGEALLOC */
+
+static int fixmaps;
+
+void __set_fixmap (enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
+{
+ unsigned long address = __fix_to_virt(idx);
+
+ if (idx >= __end_of_fixed_addresses) {
+ BUG();
+ return;
+ }
+
+ map_page(address, phys, pgprot_val(flags));
+ fixmaps++;
+}
+
+void __this_fixmap_does_not_exist(void)
+{
+ WARN_ON(1);
+}
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 3ef0ad2f9ca..f6ce1f111f5 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -26,13 +26,16 @@
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
+#include <linux/export.h>
#include <linux/types.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/stddef.h>
#include <linux/vmalloc.h>
-#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/slab.h>
#include <asm/pgalloc.h>
#include <asm/page.h>
@@ -47,27 +50,52 @@
#include <asm/processor.h>
#include <asm/cputable.h>
#include <asm/sections.h>
-#include <asm/system.h>
-#include <asm/abs_addr.h>
#include <asm/firmware.h>
#include "mmu_decl.h"
+/* Some sanity checking */
+#if TASK_SIZE_USER64 > PGTABLE_RANGE
+#error TASK_SIZE_USER64 exceeds pagetable range
+#endif
+
+#ifdef CONFIG_PPC_STD_MMU_64
+#if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT))
+#error TASK_SIZE_USER64 exceeds user VSID range
+#endif
+#endif
+
unsigned long ioremap_bot = IOREMAP_BASE;
+#ifdef CONFIG_PPC_MMU_NOHASH
+static void *early_alloc_pgtable(unsigned long size)
+{
+ void *pt;
+
+ if (init_bootmem_done)
+ pt = __alloc_bootmem(size, size, __pa(MAX_DMA_ADDRESS));
+ else
+ pt = __va(memblock_alloc_base(size, size,
+ __pa(MAX_DMA_ADDRESS)));
+ memset(pt, 0, size);
+
+ return pt;
+}
+#endif /* CONFIG_PPC_MMU_NOHASH */
+
/*
- * map_io_page currently only called by __ioremap
- * map_io_page adds an entry to the ioremap page table
+ * map_kernel_page currently only called by __ioremap
+ * map_kernel_page adds an entry to the ioremap page table
* and adds an entry to the HPT, possibly bolting it
*/
-static int map_io_page(unsigned long ea, unsigned long pa, int flags)
+int map_kernel_page(unsigned long ea, unsigned long pa, int flags)
{
pgd_t *pgdp;
pud_t *pudp;
pmd_t *pmdp;
pte_t *ptep;
- if (mem_init_done) {
+ if (slab_is_available()) {
pgdp = pgd_offset_k(ea);
pudp = pud_alloc(&init_mm, pgdp, ea);
if (!pudp)
@@ -81,6 +109,35 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags)
set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
__pgprot(flags)));
} else {
+#ifdef CONFIG_PPC_MMU_NOHASH
+ /* Warning ! This will blow up if bootmem is not initialized
+ * which our ppc64 code is keen to do that, we'll need to
+ * fix it and/or be more careful
+ */
+ pgdp = pgd_offset_k(ea);
+#ifdef PUD_TABLE_SIZE
+ if (pgd_none(*pgdp)) {
+ pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
+ BUG_ON(pudp == NULL);
+ pgd_populate(&init_mm, pgdp, pudp);
+ }
+#endif /* PUD_TABLE_SIZE */
+ pudp = pud_offset(pgdp, ea);
+ if (pud_none(*pudp)) {
+ pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
+ BUG_ON(pmdp == NULL);
+ pud_populate(&init_mm, pudp, pmdp);
+ }
+ pmdp = pmd_offset(pudp, ea);
+ if (!pmd_present(*pmdp)) {
+ ptep = early_alloc_pgtable(PAGE_SIZE);
+ BUG_ON(ptep == NULL);
+ pmd_populate_kernel(&init_mm, pmdp, ptep);
+ }
+ ptep = pte_offset_kernel(pmdp, ea);
+ set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
+ __pgprot(flags)));
+#else /* CONFIG_PPC_MMU_NOHASH */
/*
* If the mm subsystem is not fully up, we cannot create a
* linux page table entry for this mapping. Simply bolt an
@@ -93,7 +150,20 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags)
"memory at %016lx !\n", pa);
return -ENOMEM;
}
+#endif /* !CONFIG_PPC_MMU_NOHASH */
}
+
+#ifdef CONFIG_PPC_BOOK3E_64
+ /*
+ * With hardware tablewalk, a sync is needed to ensure that
+ * subsequent accesses see the PTE we just wrote. Unlike userspace
+ * mappings, we can't tolerate spurious faults, so make sure
+ * the new PTE will be seen the first time.
+ */
+ mb();
+#else
+ smp_wmb();
+#endif
return 0;
}
@@ -107,15 +177,24 @@ void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size,
{
unsigned long i;
+ /* Make sure we have the base flags */
if ((flags & _PAGE_PRESENT) == 0)
flags |= pgprot_val(PAGE_KERNEL);
+ /* Non-cacheable page cannot be coherent */
+ if (flags & _PAGE_NO_CACHE)
+ flags &= ~_PAGE_COHERENT;
+
+ /* We don't support the 4K PFN hack with ioremap */
+ if (flags & _PAGE_4K_PFN)
+ return NULL;
+
WARN_ON(pa & ~PAGE_MASK);
WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
WARN_ON(size & ~PAGE_MASK);
for (i = 0; i < size; i += PAGE_SIZE)
- if (map_io_page((unsigned long)ea+i, pa+i, flags))
+ if (map_kernel_page((unsigned long)ea+i, pa+i, flags))
return NULL;
return (void __iomem *)ea;
@@ -135,8 +214,8 @@ void __iounmap_at(void *ea, unsigned long size)
unmap_kernel_range((unsigned long)ea, size);
}
-void __iomem * __ioremap(phys_addr_t addr, unsigned long size,
- unsigned long flags)
+void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size,
+ unsigned long flags, void *caller)
{
phys_addr_t paligned;
void __iomem *ret;
@@ -159,10 +238,13 @@ void __iomem * __ioremap(phys_addr_t addr, unsigned long size,
if (mem_init_done) {
struct vm_struct *area;
- area = __get_vm_area(size, VM_IOREMAP,
- ioremap_bot, IOREMAP_END);
+ area = __get_vm_area_caller(size, VM_IOREMAP,
+ ioremap_bot, IOREMAP_END,
+ caller);
if (area == NULL)
return NULL;
+
+ area->phys_addr = paligned;
ret = __ioremap_at(paligned, area->addr, size, flags);
if (!ret)
vunmap(area->addr);
@@ -177,22 +259,55 @@ void __iomem * __ioremap(phys_addr_t addr, unsigned long size,
return ret;
}
+void __iomem * __ioremap(phys_addr_t addr, unsigned long size,
+ unsigned long flags)
+{
+ return __ioremap_caller(addr, size, flags, __builtin_return_address(0));
+}
void __iomem * ioremap(phys_addr_t addr, unsigned long size)
{
unsigned long flags = _PAGE_NO_CACHE | _PAGE_GUARDED;
+ void *caller = __builtin_return_address(0);
if (ppc_md.ioremap)
- return ppc_md.ioremap(addr, size, flags);
- return __ioremap(addr, size, flags);
+ return ppc_md.ioremap(addr, size, flags, caller);
+ return __ioremap_caller(addr, size, flags, caller);
}
-void __iomem * ioremap_flags(phys_addr_t addr, unsigned long size,
+void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size)
+{
+ unsigned long flags = _PAGE_NO_CACHE;
+ void *caller = __builtin_return_address(0);
+
+ if (ppc_md.ioremap)
+ return ppc_md.ioremap(addr, size, flags, caller);
+ return __ioremap_caller(addr, size, flags, caller);
+}
+
+void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size,
unsigned long flags)
{
+ void *caller = __builtin_return_address(0);
+
+ /* writeable implies dirty for kernel addresses */
+ if (flags & _PAGE_RW)
+ flags |= _PAGE_DIRTY;
+
+ /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
+ flags &= ~(_PAGE_USER | _PAGE_EXEC);
+
+#ifdef _PAGE_BAP_SR
+ /* _PAGE_USER contains _PAGE_BAP_SR on BookE using the new PTE format
+ * which means that we just cleared supervisor access... oops ;-) This
+ * restores it
+ */
+ flags |= _PAGE_BAP_SR;
+#endif
+
if (ppc_md.ioremap)
- return ppc_md.ioremap(addr, size, flags);
- return __ioremap(addr, size, flags);
+ return ppc_md.ioremap(addr, size, flags, caller);
+ return __ioremap_caller(addr, size, flags, caller);
}
@@ -226,9 +341,550 @@ void iounmap(volatile void __iomem *token)
}
EXPORT_SYMBOL(ioremap);
-EXPORT_SYMBOL(ioremap_flags);
+EXPORT_SYMBOL(ioremap_wc);
+EXPORT_SYMBOL(ioremap_prot);
EXPORT_SYMBOL(__ioremap);
EXPORT_SYMBOL(__ioremap_at);
EXPORT_SYMBOL(iounmap);
EXPORT_SYMBOL(__iounmap);
EXPORT_SYMBOL(__iounmap_at);
+
+/*
+ * For hugepage we have pfn in the pmd, we use PTE_RPN_SHIFT bits for flags
+ * For PTE page, we have a PTE_FRAG_SIZE (4K) aligned virtual address.
+ */
+struct page *pmd_page(pmd_t pmd)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (pmd_trans_huge(pmd))
+ return pfn_to_page(pmd_pfn(pmd));
+#endif
+ return virt_to_page(pmd_page_vaddr(pmd));
+}
+
+#ifdef CONFIG_PPC_64K_PAGES
+static pte_t *get_from_cache(struct mm_struct *mm)
+{
+ void *pte_frag, *ret;
+
+ spin_lock(&mm->page_table_lock);
+ ret = mm->context.pte_frag;
+ if (ret) {
+ pte_frag = ret + PTE_FRAG_SIZE;
+ /*
+ * If we have taken up all the fragments mark PTE page NULL
+ */
+ if (((unsigned long)pte_frag & ~PAGE_MASK) == 0)
+ pte_frag = NULL;
+ mm->context.pte_frag = pte_frag;
+ }
+ spin_unlock(&mm->page_table_lock);
+ return (pte_t *)ret;
+}
+
+static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel)
+{
+ void *ret = NULL;
+ struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
+ __GFP_REPEAT | __GFP_ZERO);
+ if (!page)
+ return NULL;
+ if (!kernel && !pgtable_page_ctor(page)) {
+ __free_page(page);
+ return NULL;
+ }
+
+ ret = page_address(page);
+ spin_lock(&mm->page_table_lock);
+ /*
+ * If we find pgtable_page set, we return
+ * the allocated page with single fragement
+ * count.
+ */
+ if (likely(!mm->context.pte_frag)) {
+ atomic_set(&page->_count, PTE_FRAG_NR);
+ mm->context.pte_frag = ret + PTE_FRAG_SIZE;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ return (pte_t *)ret;
+}
+
+pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel)
+{
+ pte_t *pte;
+
+ pte = get_from_cache(mm);
+ if (pte)
+ return pte;
+
+ return __alloc_for_cache(mm, kernel);
+}
+
+void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel)
+{
+ struct page *page = virt_to_page(table);
+ if (put_page_testzero(page)) {
+ if (!kernel)
+ pgtable_page_dtor(page);
+ free_hot_cold_page(page, 0);
+ }
+}
+
+#ifdef CONFIG_SMP
+static void page_table_free_rcu(void *table)
+{
+ struct page *page = virt_to_page(table);
+ if (put_page_testzero(page)) {
+ pgtable_page_dtor(page);
+ free_hot_cold_page(page, 0);
+ }
+}
+
+void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
+{
+ unsigned long pgf = (unsigned long)table;
+
+ BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
+ pgf |= shift;
+ tlb_remove_table(tlb, (void *)pgf);
+}
+
+void __tlb_remove_table(void *_table)
+{
+ void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
+ unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
+
+ if (!shift)
+ /* PTE page needs special handling */
+ page_table_free_rcu(table);
+ else {
+ BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
+ kmem_cache_free(PGT_CACHE(shift), table);
+ }
+}
+#else
+void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
+{
+ if (!shift) {
+ /* PTE page needs special handling */
+ struct page *page = virt_to_page(table);
+ if (put_page_testzero(page)) {
+ pgtable_page_dtor(page);
+ free_hot_cold_page(page, 0);
+ }
+ } else {
+ BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
+ kmem_cache_free(PGT_CACHE(shift), table);
+ }
+}
+#endif
+#endif /* CONFIG_PPC_64K_PAGES */
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+/*
+ * This is called when relaxing access to a hugepage. It's also called in the page
+ * fault path when we don't hit any of the major fault cases, ie, a minor
+ * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have
+ * handled those two for us, we additionally deal with missing execute
+ * permission here on some processors
+ */
+int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp, pmd_t entry, int dirty)
+{
+ int changed;
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(!pmd_trans_huge(*pmdp));
+ assert_spin_locked(&vma->vm_mm->page_table_lock);
+#endif
+ changed = !pmd_same(*(pmdp), entry);
+ if (changed) {
+ __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry));
+ /*
+ * Since we are not supporting SW TLB systems, we don't
+ * have any thing similar to flush_tlb_page_nohash()
+ */
+ }
+ return changed;
+}
+
+unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, unsigned long clr,
+ unsigned long set)
+{
+
+ unsigned long old, tmp;
+
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(!pmd_trans_huge(*pmdp));
+ assert_spin_locked(&mm->page_table_lock);
+#endif
+
+#ifdef PTE_ATOMIC_UPDATES
+ __asm__ __volatile__(
+ "1: ldarx %0,0,%3\n\
+ andi. %1,%0,%6\n\
+ bne- 1b \n\
+ andc %1,%0,%4 \n\
+ or %1,%1,%7\n\
+ stdcx. %1,0,%3 \n\
+ bne- 1b"
+ : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
+ : "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY), "r" (set)
+ : "cc" );
+#else
+ old = pmd_val(*pmdp);
+ *pmdp = __pmd((old & ~clr) | set);
+#endif
+ if (old & _PAGE_HASHPTE)
+ hpte_do_hugepage_flush(mm, addr, pmdp);
+ return old;
+}
+
+pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ pmd_t pmd;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ if (pmd_trans_huge(*pmdp)) {
+ pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
+ } else {
+ /*
+ * khugepaged calls this for normal pmd
+ */
+ pmd = *pmdp;
+ pmd_clear(pmdp);
+ /*
+ * Wait for all pending hash_page to finish. This is needed
+ * in case of subpage collapse. When we collapse normal pages
+ * to hugepage, we first clear the pmd, then invalidate all
+ * the PTE entries. The assumption here is that any low level
+ * page fault will see a none pmd and take the slow path that
+ * will wait on mmap_sem. But we could very well be in a
+ * hash_page with local ptep pointer value. Such a hash page
+ * can result in adding new HPTE entries for normal subpages.
+ * That means we could be modifying the page content as we
+ * copy them to a huge page. So wait for parallel hash_page
+ * to finish before invalidating HPTE entries. We can do this
+ * by sending an IPI to all the cpus and executing a dummy
+ * function there.
+ */
+ kick_all_cpus_sync();
+ /*
+ * Now invalidate the hpte entries in the range
+ * covered by pmd. This make sure we take a
+ * fault and will find the pmd as none, which will
+ * result in a major fault which takes mmap_sem and
+ * hence wait for collapse to complete. Without this
+ * the __collapse_huge_page_copy can result in copying
+ * the old content.
+ */
+ flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
+ }
+ return pmd;
+}
+
+int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
+}
+
+/*
+ * We currently remove entries from the hashtable regardless of whether
+ * the entry was young or dirty. The generic routines only flush if the
+ * entry was young or dirty which is not good enough.
+ *
+ * We should be more intelligent about this but for the moment we override
+ * these functions and force a tlb flush unconditionally
+ */
+int pmdp_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
+}
+
+/*
+ * We mark the pmd splitting and invalidate all the hpte
+ * entries for this hugepage.
+ */
+void pmdp_splitting_flush(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ unsigned long old, tmp;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(!pmd_trans_huge(*pmdp));
+ assert_spin_locked(&vma->vm_mm->page_table_lock);
+#endif
+
+#ifdef PTE_ATOMIC_UPDATES
+
+ __asm__ __volatile__(
+ "1: ldarx %0,0,%3\n\
+ andi. %1,%0,%6\n\
+ bne- 1b \n\
+ ori %1,%0,%4 \n\
+ stdcx. %1,0,%3 \n\
+ bne- 1b"
+ : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
+ : "r" (pmdp), "i" (_PAGE_SPLITTING), "m" (*pmdp), "i" (_PAGE_BUSY)
+ : "cc" );
+#else
+ old = pmd_val(*pmdp);
+ *pmdp = __pmd(old | _PAGE_SPLITTING);
+#endif
+ /*
+ * If we didn't had the splitting flag set, go and flush the
+ * HPTE entries.
+ */
+ if (!(old & _PAGE_SPLITTING)) {
+ /* We need to flush the hpte */
+ if (old & _PAGE_HASHPTE)
+ hpte_do_hugepage_flush(vma->vm_mm, address, pmdp);
+ }
+ /*
+ * This ensures that generic code that rely on IRQ disabling
+ * to prevent a parallel THP split work as expected.
+ */
+ kick_all_cpus_sync();
+}
+
+/*
+ * We want to put the pgtable in pmd and use pgtable for tracking
+ * the base page size hptes
+ */
+void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+ pgtable_t pgtable)
+{
+ pgtable_t *pgtable_slot;
+ assert_spin_locked(&mm->page_table_lock);
+ /*
+ * we store the pgtable in the second half of PMD
+ */
+ pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+ *pgtable_slot = pgtable;
+ /*
+ * expose the deposited pgtable to other cpus.
+ * before we set the hugepage PTE at pmd level
+ * hash fault code looks at the deposted pgtable
+ * to store hash index values.
+ */
+ smp_wmb();
+}
+
+pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
+{
+ pgtable_t pgtable;
+ pgtable_t *pgtable_slot;
+
+ assert_spin_locked(&mm->page_table_lock);
+ pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+ pgtable = *pgtable_slot;
+ /*
+ * Once we withdraw, mark the entry NULL.
+ */
+ *pgtable_slot = NULL;
+ /*
+ * We store HPTE information in the deposited PTE fragment.
+ * zero out the content on withdraw.
+ */
+ memset(pgtable, 0, PTE_FRAG_SIZE);
+ return pgtable;
+}
+
+/*
+ * set a new huge pmd. We should not be called for updating
+ * an existing pmd entry. That should go via pmd_hugepage_update.
+ */
+void set_pmd_at(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, pmd_t pmd)
+{
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(pmd_val(*pmdp) & _PAGE_PRESENT);
+ assert_spin_locked(&mm->page_table_lock);
+ WARN_ON(!pmd_trans_huge(pmd));
+#endif
+ return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
+}
+
+void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0);
+}
+
+/*
+ * A linux hugepage PMD was changed and the corresponding hash table entries
+ * neesd to be flushed.
+ */
+void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp)
+{
+ int ssize, i;
+ unsigned long s_addr;
+ int max_hpte_count;
+ unsigned int psize, valid;
+ unsigned char *hpte_slot_array;
+ unsigned long hidx, vpn, vsid, hash, shift, slot;
+
+ /*
+ * Flush all the hptes mapping this hugepage
+ */
+ s_addr = addr & HPAGE_PMD_MASK;
+ hpte_slot_array = get_hpte_slot_array(pmdp);
+ /*
+ * IF we try to do a HUGE PTE update after a withdraw is done.
+ * we will find the below NULL. This happens when we do
+ * split_huge_page_pmd
+ */
+ if (!hpte_slot_array)
+ return;
+
+ /* get the base page size */
+ psize = get_slice_psize(mm, s_addr);
+
+ if (ppc_md.hugepage_invalidate)
+ return ppc_md.hugepage_invalidate(mm, hpte_slot_array,
+ s_addr, psize);
+ /*
+ * No bluk hpte removal support, invalidate each entry
+ */
+ shift = mmu_psize_defs[psize].shift;
+ max_hpte_count = HPAGE_PMD_SIZE >> shift;
+ for (i = 0; i < max_hpte_count; i++) {
+ /*
+ * 8 bits per each hpte entries
+ * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
+ */
+ valid = hpte_valid(hpte_slot_array, i);
+ if (!valid)
+ continue;
+ hidx = hpte_hash_index(hpte_slot_array, i);
+
+ /* get the vpn */
+ addr = s_addr + (i * (1ul << shift));
+ if (!is_kernel_addr(addr)) {
+ ssize = user_segment_size(addr);
+ vsid = get_vsid(mm->context.id, addr, ssize);
+ WARN_ON(vsid == 0);
+ } else {
+ vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+ ssize = mmu_kernel_ssize;
+ }
+
+ vpn = hpt_vpn(addr, vsid, ssize);
+ hash = hpt_hash(vpn, shift, ssize);
+ if (hidx & _PTEIDX_SECONDARY)
+ hash = ~hash;
+
+ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot += hidx & _PTEIDX_GROUP_IX;
+ ppc_md.hpte_invalidate(slot, vpn, psize,
+ MMU_PAGE_16M, ssize, 0);
+ }
+}
+
+static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
+{
+ pmd_val(pmd) |= pgprot_val(pgprot);
+ return pmd;
+}
+
+pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
+{
+ pmd_t pmd;
+ /*
+ * For a valid pte, we would have _PAGE_PRESENT or _PAGE_FILE always
+ * set. We use this to check THP page at pmd level.
+ * leaf pte for huge page, bottom two bits != 00
+ */
+ pmd_val(pmd) = pfn << PTE_RPN_SHIFT;
+ pmd_val(pmd) |= _PAGE_THP_HUGE;
+ pmd = pmd_set_protbits(pmd, pgprot);
+ return pmd;
+}
+
+pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
+{
+ return pfn_pmd(page_to_pfn(page), pgprot);
+}
+
+pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
+{
+
+ pmd_val(pmd) &= _HPAGE_CHG_MASK;
+ pmd = pmd_set_protbits(pmd, newprot);
+ return pmd;
+}
+
+/*
+ * This is called at the end of handling a user page fault, when the
+ * fault has been handled by updating a HUGE PMD entry in the linux page tables.
+ * We use it to preload an HPTE into the hash table corresponding to
+ * the updated linux HUGE PMD entry.
+ */
+void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd)
+{
+ return;
+}
+
+pmd_t pmdp_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp)
+{
+ pmd_t old_pmd;
+ pgtable_t pgtable;
+ unsigned long old;
+ pgtable_t *pgtable_slot;
+
+ old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
+ old_pmd = __pmd(old);
+ /*
+ * We have pmd == none and we are holding page_table_lock.
+ * So we can safely go and clear the pgtable hash
+ * index info.
+ */
+ pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+ pgtable = *pgtable_slot;
+ /*
+ * Let's zero out old valid and hash index details
+ * hash fault look at them.
+ */
+ memset(pgtable, 0, PTE_FRAG_SIZE);
+ return old_pmd;
+}
+
+int has_transparent_hugepage(void)
+{
+ if (!mmu_has_feature(MMU_FTR_16M_PAGE))
+ return 0;
+ /*
+ * We support THP only if PMD_SIZE is 16MB.
+ */
+ if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
+ return 0;
+ /*
+ * We need to make sure that we support 16MB hugepage in a segement
+ * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
+ * of 64K.
+ */
+ /*
+ * If we have 64K HPTE, we will be using that by default
+ */
+ if (mmu_psize_defs[MMU_PAGE_64K].shift &&
+ (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
+ return 0;
+ /*
+ * Ok we only have 4K HPTE
+ */
+ if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
+ return 0;
+
+ return 1;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c
index 5c45d474cfc..11571e11883 100644
--- a/arch/powerpc/mm/ppc_mmu_32.c
+++ b/arch/powerpc/mm/ppc_mmu_32.c
@@ -26,11 +26,11 @@
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/highmem.h>
+#include <linux/memblock.h>
#include <asm/prom.h>
#include <asm/mmu.h>
#include <asm/machdep.h>
-#include <asm/lmb.h>
#include "mmu_decl.h"
@@ -38,21 +38,18 @@ struct hash_pte *Hash, *Hash_end;
unsigned long Hash_size, Hash_mask;
unsigned long _SDR1;
-union ubat { /* BAT register values to be loaded */
- struct ppc_bat bat;
- u32 word[2];
-} BATS[8][2]; /* 8 pairs of IBAT, DBAT */
+struct ppc_bat BATS[8][2]; /* 8 pairs of IBAT, DBAT */
struct batrange { /* stores address ranges mapped by BATs */
unsigned long start;
unsigned long limit;
- unsigned long phys;
+ phys_addr_t phys;
} bat_addrs[8];
/*
* Return PA for this VA if it is mapped by a BAT, or 0
*/
-unsigned long v_mapped_by_bats(unsigned long va)
+phys_addr_t v_mapped_by_bats(unsigned long va)
{
int b;
for (b = 0; b < 4; ++b)
@@ -64,7 +61,7 @@ unsigned long v_mapped_by_bats(unsigned long va)
/*
* Return VA for a given PA or 0 if not mapped
*/
-unsigned long p_mapped_by_bats(unsigned long pa)
+unsigned long p_mapped_by_bats(phys_addr_t pa)
{
int b;
for (b = 0; b < 4; ++b)
@@ -75,14 +72,10 @@ unsigned long p_mapped_by_bats(unsigned long pa)
return 0;
}
-unsigned long __init mmu_mapin_ram(void)
+unsigned long __init mmu_mapin_ram(unsigned long top)
{
-#ifdef CONFIG_POWER4
- return 0;
-#else
unsigned long tot, bl, done;
unsigned long max_size = (256<<20);
- unsigned long align;
if (__map_without_bats) {
printk(KERN_DEBUG "RAM mapped without BATs\n");
@@ -93,32 +86,25 @@ unsigned long __init mmu_mapin_ram(void)
/* Make sure we don't map a block larger than the
smallest alignment of the physical address. */
- /* alignment of PPC_MEMSTART */
- align = ~(PPC_MEMSTART-1) & PPC_MEMSTART;
- /* set BAT block size to MIN(max_size, align) */
- if (align && align < max_size)
- max_size = align;
-
- tot = total_lowmem;
+ tot = top;
for (bl = 128<<10; bl < max_size; bl <<= 1) {
if (bl * 2 > tot)
break;
}
- setbat(2, KERNELBASE, PPC_MEMSTART, bl, _PAGE_RAM);
- done = (unsigned long)bat_addrs[2].limit - KERNELBASE + 1;
+ setbat(2, PAGE_OFFSET, 0, bl, PAGE_KERNEL_X);
+ done = (unsigned long)bat_addrs[2].limit - PAGE_OFFSET + 1;
if ((done < tot) && !bat_addrs[3].limit) {
/* use BAT3 to cover a bit more */
tot -= done;
for (bl = 128<<10; bl < max_size; bl <<= 1)
if (bl * 2 > tot)
break;
- setbat(3, KERNELBASE+done, PPC_MEMSTART+done, bl, _PAGE_RAM);
- done = (unsigned long)bat_addrs[3].limit - KERNELBASE + 1;
+ setbat(3, PAGE_OFFSET+done, done, bl, PAGE_KERNEL_X);
+ done = (unsigned long)bat_addrs[3].limit - PAGE_OFFSET + 1;
}
return done;
-#endif
}
/*
@@ -126,16 +112,16 @@ unsigned long __init mmu_mapin_ram(void)
* The parameters are not checked; in particular size must be a power
* of 2 between 128k and 256M.
*/
-void __init setbat(int index, unsigned long virt, unsigned long phys,
+void __init setbat(int index, unsigned long virt, phys_addr_t phys,
unsigned int size, int flags)
{
unsigned int bl;
int wimgxpp;
- union ubat *bat = BATS[index];
+ struct ppc_bat *bat = BATS[index];
- if (((flags & _PAGE_NO_CACHE) == 0) &&
- cpu_has_feature(CPU_FTR_NEED_COHERENT))
- flags |= _PAGE_COHERENT;
+ if ((flags & _PAGE_NO_CACHE) ||
+ (cpu_has_feature(CPU_FTR_NEED_COHERENT) == 0))
+ flags &= ~_PAGE_COHERENT;
bl = (size >> 17) - 1;
if (PVR_VER(mfspr(SPRN_PVR)) != 1) {
@@ -144,15 +130,13 @@ void __init setbat(int index, unsigned long virt, unsigned long phys,
wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE
| _PAGE_COHERENT | _PAGE_GUARDED);
wimgxpp |= (flags & _PAGE_RW)? BPP_RW: BPP_RX;
- bat[1].word[0] = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */
- bat[1].word[1] = phys | wimgxpp;
-#ifndef CONFIG_KGDB /* want user access for breakpoints */
+ bat[1].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */
+ bat[1].batl = BAT_PHYS_ADDR(phys) | wimgxpp;
if (flags & _PAGE_USER)
-#endif
- bat[1].bat.batu.vp = 1;
+ bat[1].batu |= 1; /* Vp = 1 */
if (flags & _PAGE_GUARDED) {
/* G bit must be zero in IBATs */
- bat[0].word[0] = bat[0].word[1] = 0;
+ bat[0].batu = bat[0].batl = 0;
} else {
/* make IBAT same as DBAT */
bat[0] = bat[1];
@@ -165,8 +149,8 @@ void __init setbat(int index, unsigned long virt, unsigned long phys,
| _PAGE_COHERENT);
wimgxpp |= (flags & _PAGE_RW)?
((flags & _PAGE_USER)? PP_RWRW: PP_RWXX): PP_RXRX;
- bat->word[0] = virt | wimgxpp | 4; /* Ks=0, Ku=1 */
- bat->word[1] = phys | bl | 0x40; /* V=1 */
+ bat->batu = virt | wimgxpp | 4; /* Ks=0, Ku=1 */
+ bat->batl = phys | bl | 0x40; /* V=1 */
}
bat_addrs[index].start = virt;
@@ -202,7 +186,7 @@ void __init MMU_init_hw(void)
extern unsigned int hash_page[];
extern unsigned int flush_hash_patch_A[], flush_hash_patch_B[];
- if (!cpu_has_feature(CPU_FTR_HPTE_TABLE)) {
+ if (!mmu_has_feature(MMU_FTR_HPTE_TABLE)) {
/*
* Put a blr (procedure return) instruction at the
* start of hash_page, since we can still get DSI
@@ -239,15 +223,14 @@ void __init MMU_init_hw(void)
* Find some memory for the hash table.
*/
if ( ppc_md.progress ) ppc_md.progress("hash:find piece", 0x322);
- Hash = __va(lmb_alloc_base(Hash_size, Hash_size,
- __initial_memory_limit));
+ Hash = __va(memblock_alloc(Hash_size, Hash_size));
cacheable_memzero(Hash, Hash_size);
_SDR1 = __pa(Hash) | SDR1_LOW_BITS;
Hash_end = (struct hash_pte *) ((unsigned long)Hash + Hash_size);
- printk("Total memory = %ldMB; using %ldkB for hash table (at %p)\n",
- total_memory >> 20, Hash_size >> 10, Hash);
+ printk("Total memory = %lldMB; using %ldkB for hash table (at %p)\n",
+ (unsigned long long)(total_memory >> 20), Hash_size >> 10, Hash);
/*
@@ -288,3 +271,18 @@ void __init MMU_init_hw(void)
if ( ppc_md.progress ) ppc_md.progress("hash:done", 0x205);
}
+
+void setup_initial_memory_limit(phys_addr_t first_memblock_base,
+ phys_addr_t first_memblock_size)
+{
+ /* We don't currently support the first MEMBLOCK not mapping 0
+ * physical on those processors
+ */
+ BUG_ON(first_memblock_base != 0);
+
+ /* 601 can only access 16MB at the moment */
+ if (PVR_VER(mfspr(SPRN_PVR)) == 1)
+ memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01000000));
+ else /* Anything else has 256M mapped */
+ memblock_set_current_limit(min_t(u64, first_memblock_size, 0x10000000));
+}
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index 906daeda59a..0399a670295 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -2,7 +2,7 @@
* PowerPC64 SLB support.
*
* Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM
- * Based on earlier code writteh by:
+ * Based on earlier code written by:
* Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
* Copyright (c) 2001 Dave Engebretsen
* Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
@@ -14,8 +14,6 @@
* 2 of the License, or (at your option) any later version.
*/
-#undef DEBUG
-
#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/mmu_context.h>
@@ -23,15 +21,10 @@
#include <asm/cputable.h>
#include <asm/cacheflush.h>
#include <asm/smp.h>
-#include <asm/firmware.h>
#include <linux/compiler.h>
#include <asm/udbg.h>
+#include <asm/code-patching.h>
-#ifdef DEBUG
-#define DBG(fmt...) udbg_printf(fmt)
-#else
-#define DBG(fmt...)
-#endif
extern void slb_allocate_realmode(unsigned long ea);
extern void slb_allocate_user(unsigned long ea);
@@ -44,13 +37,13 @@ static void slb_allocate(unsigned long ea)
slb_allocate_realmode(ea);
}
+#define slb_esid_mask(ssize) \
+ (((ssize) == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T)
+
static inline unsigned long mk_esid_data(unsigned long ea, int ssize,
unsigned long slot)
{
- unsigned long mask;
-
- mask = (ssize == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T;
- return (ea & mask) | SLB_ESID_V | slot;
+ return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | slot;
}
#define slb_vsid_shift(ssize) \
@@ -73,8 +66,10 @@ static inline void slb_shadow_update(unsigned long ea, int ssize,
* we only update the current CPU's SLB shadow buffer.
*/
get_slb_shadow()->save_area[entry].esid = 0;
- get_slb_shadow()->save_area[entry].vsid = mk_vsid_data(ea, ssize, flags);
- get_slb_shadow()->save_area[entry].esid = mk_esid_data(ea, ssize, entry);
+ get_slb_shadow()->save_area[entry].vsid =
+ cpu_to_be64(mk_vsid_data(ea, ssize, flags));
+ get_slb_shadow()->save_area[entry].esid =
+ cpu_to_be64(mk_esid_data(ea, ssize, entry));
}
static inline void slb_shadow_clear(unsigned long entry)
@@ -99,15 +94,13 @@ static inline void create_shadowed_slbe(unsigned long ea, int ssize,
: "memory" );
}
-void slb_flush_and_rebolt(void)
+static void __slb_flush_and_rebolt(void)
{
/* If you change this make sure you change SLB_NUM_BOLTED
- * appropriately too. */
+ * and PR KVM appropriately too. */
unsigned long linear_llp, vmalloc_llp, lflags, vflags;
unsigned long ksp_esid_data, ksp_vsid_data;
- WARN_ON(!irqs_disabled());
-
linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp;
lflags = SLB_VSID_KERNEL | linear_llp;
@@ -121,15 +114,10 @@ void slb_flush_and_rebolt(void)
} else {
/* Update stack entry; others don't change */
slb_shadow_update(get_paca()->kstack, mmu_kernel_ssize, lflags, 2);
- ksp_vsid_data = get_slb_shadow()->save_area[2].vsid;
+ ksp_vsid_data =
+ be64_to_cpu(get_slb_shadow()->save_area[2].vsid);
}
- /*
- * We can't take a PMU exception in the following code, so hard
- * disable interrupts.
- */
- hard_irq_disable();
-
/* We need to do this all in asm, so we're sure we don't touch
* the stack between the slbia and rebolting it. */
asm volatile("isync\n"
@@ -146,6 +134,21 @@ void slb_flush_and_rebolt(void)
: "memory");
}
+void slb_flush_and_rebolt(void)
+{
+
+ WARN_ON(!irqs_disabled());
+
+ /*
+ * We can't take a PMU exception in the following code, so hard
+ * disable interrupts.
+ */
+ hard_irq_disable();
+
+ __slb_flush_and_rebolt();
+ get_paca()->slb_cache_ptr = 0;
+}
+
void slb_vmalloc_update(void)
{
unsigned long vflags;
@@ -166,7 +169,7 @@ static inline int esids_match(unsigned long addr1, unsigned long addr2)
int esid_1t_count;
/* System is not 1T segment size capable. */
- if (!cpu_has_feature(CPU_FTR_1T_SEGMENT))
+ if (!mmu_has_feature(MMU_FTR_1T_SEGMENT))
return (GET_ESID(addr1) == GET_ESID(addr2));
esid_1t_count = (((addr1 >> SID_SHIFT_1T) != 0) +
@@ -187,13 +190,21 @@ static inline int esids_match(unsigned long addr1, unsigned long addr2)
/* Flush all user entries from the segment table of the current processor. */
void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
{
- unsigned long offset = get_paca()->slb_cache_ptr;
+ unsigned long offset;
unsigned long slbie_data = 0;
unsigned long pc = KSTK_EIP(tsk);
unsigned long stack = KSTK_ESP(tsk);
- unsigned long unmapped_base;
+ unsigned long exec_base;
- if (!cpu_has_feature(CPU_FTR_NO_SLBIE_B) &&
+ /*
+ * We need interrupts hard-disabled here, not just soft-disabled,
+ * so that a PMU interrupt can't occur, which might try to access
+ * user memory (to get a stack trace) and possible cause an SLB miss
+ * which would update the slb_cache/slb_cache_ptr fields in the PACA.
+ */
+ hard_irq_disable();
+ offset = get_paca()->slb_cache_ptr;
+ if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) &&
offset <= SLB_CACHE_ENTRIES) {
int i;
asm volatile("isync" : : : "memory");
@@ -207,7 +218,7 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
}
asm volatile("isync" : : : "memory");
} else {
- slb_flush_and_rebolt();
+ __slb_flush_and_rebolt();
}
/* Workaround POWER5 < DD2.1 issue */
@@ -219,40 +230,45 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
/*
* preload some userspace segments into the SLB.
+ * Almost all 32 and 64bit PowerPC executables are linked at
+ * 0x10000000 so it makes sense to preload this segment.
*/
- if (test_tsk_thread_flag(tsk, TIF_32BIT))
- unmapped_base = TASK_UNMAPPED_BASE_USER32;
- else
- unmapped_base = TASK_UNMAPPED_BASE_USER64;
-
- if (is_kernel_addr(pc))
- return;
- slb_allocate(pc);
+ exec_base = 0x10000000;
- if (esids_match(pc,stack))
+ if (is_kernel_addr(pc) || is_kernel_addr(stack) ||
+ is_kernel_addr(exec_base))
return;
- if (is_kernel_addr(stack))
- return;
- slb_allocate(stack);
+ slb_allocate(pc);
- if (esids_match(pc,unmapped_base) || esids_match(stack,unmapped_base))
- return;
+ if (!esids_match(pc, stack))
+ slb_allocate(stack);
- if (is_kernel_addr(unmapped_base))
- return;
- slb_allocate(unmapped_base);
+ if (!esids_match(pc, exec_base) &&
+ !esids_match(stack, exec_base))
+ slb_allocate(exec_base);
}
static inline void patch_slb_encoding(unsigned int *insn_addr,
unsigned int immed)
{
- /* Assume the instruction had a "0" immediate value, just
- * "or" in the new value
- */
- *insn_addr |= immed;
- flush_icache_range((unsigned long)insn_addr, 4+
- (unsigned long)insn_addr);
+ int insn = (*insn_addr & 0xffff0000) | immed;
+ patch_instruction(insn_addr, insn);
+}
+
+extern u32 slb_compare_rr_to_size[];
+extern u32 slb_miss_kernel_load_linear[];
+extern u32 slb_miss_kernel_load_io[];
+extern u32 slb_compare_rr_to_size[];
+extern u32 slb_miss_kernel_load_vmemmap[];
+
+void slb_set_size(u16 size)
+{
+ if (mmu_slb_size == size)
+ return;
+
+ mmu_slb_size = size;
+ patch_slb_encoding(slb_compare_rr_to_size, mmu_slb_size);
}
void slb_initialize(void)
@@ -260,16 +276,18 @@ void slb_initialize(void)
unsigned long linear_llp, vmalloc_llp, io_llp;
unsigned long lflags, vflags;
static int slb_encoding_inited;
- extern unsigned int *slb_miss_kernel_load_linear;
- extern unsigned int *slb_miss_kernel_load_io;
- extern unsigned int *slb_compare_rr_to_size;
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ unsigned long vmemmap_llp;
+#endif
/* Prepare our SLB miss handler based on our page size */
linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
io_llp = mmu_psize_defs[mmu_io_psize].sllp;
vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp;
get_paca()->vmalloc_sllp = SLB_VSID_KERNEL | vmalloc_llp;
-
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ vmemmap_llp = mmu_psize_defs[mmu_vmemmap_psize].sllp;
+#endif
if (!slb_encoding_inited) {
slb_encoding_inited = 1;
patch_slb_encoding(slb_miss_kernel_load_linear,
@@ -279,17 +297,18 @@ void slb_initialize(void)
patch_slb_encoding(slb_compare_rr_to_size,
mmu_slb_size);
- DBG("SLB: linear LLP = %04x\n", linear_llp);
- DBG("SLB: io LLP = %04x\n", io_llp);
+ pr_devel("SLB: linear LLP = %04lx\n", linear_llp);
+ pr_devel("SLB: io LLP = %04lx\n", io_llp);
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ patch_slb_encoding(slb_miss_kernel_load_vmemmap,
+ SLB_VSID_KERNEL | vmemmap_llp);
+ pr_devel("SLB: vmemmap LLP = %04lx\n", vmemmap_llp);
+#endif
}
get_paca()->stab_rr = SLB_NUM_BOLTED;
- /* On iSeries the bolted entries have already been set up by
- * the hypervisor from the lparMap data in head.S */
- if (firmware_has_feature(FW_FEATURE_ISERIES))
- return;
-
lflags = SLB_VSID_KERNEL | linear_llp;
vflags = SLB_VSID_KERNEL | vmalloc_llp;
@@ -301,11 +320,16 @@ void slb_initialize(void)
create_shadowed_slbe(VMALLOC_START, mmu_kernel_ssize, vflags, 1);
+ /* For the boot cpu, we're running on the stack in init_thread_union,
+ * which is in the first segment of the linear mapping, and also
+ * get_paca()->kstack hasn't been initialized yet.
+ * For secondary cpus, we need to bolt the kernel stack entry now.
+ */
slb_shadow_clear(2);
+ if (raw_smp_processor_id() != boot_cpuid &&
+ (get_paca()->kstack & slb_esid_mask(mmu_kernel_ssize)) > PAGE_OFFSET)
+ create_shadowed_slbe(get_paca()->kstack,
+ mmu_kernel_ssize, lflags, 2);
- /* We don't bolt the stack for the time being - we're in boot,
- * so the stack is in the bolted segment. By the time it goes
- * elsewhere, we'll call _switch() which will bolt in the new
- * one. */
asm volatile("isync":::"memory");
}
diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
index 657f6b37e9d..736d18b3cef 100644
--- a/arch/powerpc/mm/slb_low.S
+++ b/arch/powerpc/mm/slb_low.S
@@ -31,10 +31,15 @@
* No other registers are examined or changed.
*/
_GLOBAL(slb_allocate_realmode)
- /* r3 = faulting address */
+ /*
+ * check for bad kernel/user address
+ * (ea & ~REGION_MASK) >= PGTABLE_RANGE
+ */
+ rldicr. r9,r3,4,(63 - PGTABLE_EADDR_SIZE - 4)
+ bne- 8f
srdi r9,r3,60 /* get region */
- srdi r10,r3,28 /* get esid */
+ srdi r10,r3,SID_SHIFT /* get esid */
cmpldi cr7,r9,0xc /* cmp PAGE_OFFSET for later use */
/* r3 = address, r10 = esid, cr7 = <> PAGE_OFFSET */
@@ -47,48 +52,68 @@ _GLOBAL(slb_allocate_realmode)
* it to VSID 0, which is reserved as a bad VSID - one which
* will never have any pages in it. */
- /* Check if hitting the linear mapping of the vmalloc/ioremap
- * kernel space
+ /* Check if hitting the linear mapping or some other kernel space
*/
bne cr7,1f
/* Linear mapping encoding bits, the "li" instruction below will
* be patched by the kernel at boot
*/
-_GLOBAL(slb_miss_kernel_load_linear)
+.globl slb_miss_kernel_load_linear
+slb_miss_kernel_load_linear:
li r11,0
+ /*
+ * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1
+ * r9 = region id.
+ */
+ addis r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha
+ addi r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l
+
+
BEGIN_FTR_SECTION
b slb_finish_load
-END_FTR_SECTION_IFCLR(CPU_FTR_1T_SEGMENT)
+END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
b slb_finish_load_1T
-1: /* vmalloc/ioremap mapping encoding bits, the "li" instructions below
- * will be patched by the kernel at boot
+1:
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ /* Check virtual memmap region. To be patches at kernel boot */
+ cmpldi cr0,r9,0xf
+ bne 1f
+.globl slb_miss_kernel_load_vmemmap
+slb_miss_kernel_load_vmemmap:
+ li r11,0
+ b 6f
+1:
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+
+ /* vmalloc mapping gets the encoding from the PACA as the mapping
+ * can be demoted from 64K -> 4K dynamically on some machines
*/
-BEGIN_FTR_SECTION
- /* check whether this is in vmalloc or ioremap space */
clrldi r11,r10,48
cmpldi r11,(VMALLOC_SIZE >> 28) - 1
bgt 5f
lhz r11,PACAVMALLOCSLLP(r13)
b 6f
5:
-END_FTR_SECTION_IFCLR(CPU_FTR_CI_LARGE_PAGE)
-_GLOBAL(slb_miss_kernel_load_io)
+ /* IO mapping */
+.globl slb_miss_kernel_load_io
+slb_miss_kernel_load_io:
li r11,0
6:
+ /*
+ * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1
+ * r9 = region id.
+ */
+ addis r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha
+ addi r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l
+
BEGIN_FTR_SECTION
b slb_finish_load
-END_FTR_SECTION_IFCLR(CPU_FTR_1T_SEGMENT)
+END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
b slb_finish_load_1T
-0: /* user address: proto-VSID = context << 15 | ESID. First check
- * if the address is within the boundaries of the user region
- */
- srdi. r9,r10,USER_ESID_BITS
- bne- 8f /* invalid ea bits set */
-
-
+0:
/* when using slices, we extract the psize off the slice bitmaps
* and then we need to get the sllp encoding off the mmu_psize_defs
* array.
@@ -100,17 +125,31 @@ END_FTR_SECTION_IFCLR(CPU_FTR_1T_SEGMENT)
* between 4k and 64k standard page size
*/
#ifdef CONFIG_PPC_MM_SLICES
+ /* r10 have esid */
cmpldi r10,16
-
- /* Get the slice index * 4 in r11 and matching slice size mask in r9 */
- ld r9,PACALOWSLICESPSIZE(r13)
- sldi r11,r10,2
+ /* below SLICE_LOW_TOP */
blt 5f
- ld r9,PACAHIGHSLICEPSIZE(r13)
- srdi r11,r10,(SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT - 2)
- andi. r11,r11,0x3c
+ /*
+ * Handle hpsizes,
+ * r9 is get_paca()->context.high_slices_psize[index], r11 is mask_index
+ */
+ srdi r11,r10,(SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT + 1) /* index */
+ addi r9,r11,PACAHIGHSLICEPSIZE
+ lbzx r9,r13,r9 /* r9 is hpsizes[r11] */
+ /* r11 = (r10 >> (SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT)) & 0x1 */
+ rldicl r11,r10,(64 - (SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT)),63
+ b 6f
-5: /* Extract the psize and multiply to get an array offset */
+5:
+ /*
+ * Handle lpsizes
+ * r9 is get_paca()->context.low_slices_psize, r11 is index
+ */
+ ld r9,PACALOWSLICESPSIZE(r13)
+ mr r11,r10
+6:
+ sldi r11,r11,2 /* index * 4 */
+ /* Extract the psize and multiply to get an array offset */
srd r9,r9,r11
andi. r9,r9,0xf
mulli r9,r9,MMUPSIZEDEFSIZE
@@ -130,15 +169,13 @@ END_FTR_SECTION_IFCLR(CPU_FTR_1T_SEGMENT)
ld r9,PACACONTEXTID(r13)
BEGIN_FTR_SECTION
cmpldi r10,0x1000
-END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT)
- rldimi r10,r9,USER_ESID_BITS,0
-BEGIN_FTR_SECTION
bge slb_finish_load_1T
-END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT)
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
b slb_finish_load
8: /* invalid EA */
li r10,0 /* BAD_VSID */
+ li r9,0 /* BAD_VSID */
li r11,SLB_VSID_USER /* flags don't much matter */
b slb_finish_load
@@ -187,8 +224,6 @@ _GLOBAL(slb_allocate_user)
/* get context to calculate proto-VSID */
ld r9,PACACONTEXTID(r13)
- rldimi r10,r9,USER_ESID_BITS,0
-
/* fall through slb_finish_load */
#endif /* __DISABLED__ */
@@ -197,11 +232,16 @@ _GLOBAL(slb_allocate_user)
/*
* Finish loading of an SLB entry and return
*
- * r3 = EA, r10 = proto-VSID, r11 = flags, clobbers r9, cr7 = <> PAGE_OFFSET
+ * r3 = EA, r9 = context, r10 = ESID, r11 = flags, clobbers r9, cr7 = <> PAGE_OFFSET
*/
slb_finish_load:
+ rldimi r10,r9,ESID_BITS,0
ASM_VSID_SCRAMBLE(r10,r9,256M)
- rldimi r11,r10,SLB_VSID_SHIFT,16 /* combine VSID and flags */
+ /*
+ * bits above VSID_BITS_256M need to be ignored from r10
+ * also combine VSID and flags
+ */
+ rldimi r11,r10,SLB_VSID_SHIFT,(64 - (SLB_VSID_SHIFT + VSID_BITS_256M))
/* r3 = EA, r11 = VSID data */
/*
@@ -209,26 +249,12 @@ slb_finish_load:
* free slot first but that took too long. Unfortunately we
* dont have any LRU information to help us choose a slot.
*/
-#ifdef CONFIG_PPC_ISERIES
-BEGIN_FW_FTR_SECTION
- /*
- * On iSeries, the "bolted" stack segment can be cast out on
- * shared processor switch so we need to check for a miss on
- * it and restore it to the right slot.
- */
- ld r9,PACAKSAVE(r13)
- clrrdi r9,r9,28
- clrrdi r3,r3,28
- li r10,SLB_NUM_BOLTED-1 /* Stack goes in last bolted slot */
- cmpld r9,r3
- beq 3f
-END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES)
-#endif /* CONFIG_PPC_ISERIES */
7: ld r10,PACASTABRR(r13)
addi r10,r10,1
/* This gets soft patched on boot. */
-_GLOBAL(slb_compare_rr_to_size)
+.globl slb_compare_rr_to_size
+slb_compare_rr_to_size:
cmpldi r10,0
blt+ 4f
@@ -259,10 +285,10 @@ _GLOBAL(slb_compare_rr_to_size)
bge 1f
/* still room in the slb cache */
- sldi r11,r3,1 /* r11 = offset * sizeof(u16) */
- rldicl r10,r10,36,28 /* get low 16 bits of the ESID */
- add r11,r11,r13 /* r11 = (u16 *)paca + offset */
- sth r10,PACASLBCACHE(r11) /* paca->slb_cache[offset] = esid */
+ sldi r11,r3,2 /* r11 = offset * sizeof(u32) */
+ srdi r10,r10,28 /* get the 36 bits of the ESID */
+ add r11,r11,r13 /* r11 = (u32 *)paca + offset */
+ stw r10,PACASLBCACHE(r11) /* paca->slb_cache[offset] = esid */
addi r3,r3,1 /* offset++ */
b 2f
1: /* offset >= SLB_CACHE_ENTRIES */
@@ -274,14 +300,18 @@ _GLOBAL(slb_compare_rr_to_size)
/*
* Finish loading of a 1T SLB entry (for the kernel linear mapping) and return.
- * We assume legacy iSeries will never have 1T segments.
*
- * r3 = EA, r10 = proto-VSID, r11 = flags, clobbers r9
+ * r3 = EA, r9 = context, r10 = ESID(256MB), r11 = flags, clobbers r9
*/
slb_finish_load_1T:
- srdi r10,r10,40-28 /* get 1T ESID */
+ srdi r10,r10,(SID_SHIFT_1T - SID_SHIFT) /* get 1T ESID */
+ rldimi r10,r9,ESID_BITS_1T,0
ASM_VSID_SCRAMBLE(r10,r9,1T)
- rldimi r11,r10,SLB_VSID_SHIFT_1T,16 /* combine VSID and flags */
+ /*
+ * bits above VSID_BITS_1T need to be ignored from r10
+ * also combine VSID and flags
+ */
+ rldimi r11,r10,SLB_VSID_SHIFT_1T,(64 - (SLB_VSID_SHIFT_1T + VSID_BITS_1T))
li r10,MMU_SEGSIZE_1T
rldimi r11,r10,SLB_VSID_SSIZE_SHIFT,0 /* insert segment size */
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index ad928edafb0..b0c75cc15ef 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -29,11 +29,16 @@
#include <linux/pagemap.h>
#include <linux/err.h>
#include <linux/spinlock.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <asm/mman.h>
#include <asm/mmu.h>
#include <asm/spu.h>
+/* some sanity checks */
+#if (PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE
+#error PGTABLE_RANGE exceeds slice_mask high_slices size
+#endif
+
static DEFINE_SPINLOCK(slice_convert_lock);
@@ -42,7 +47,7 @@ int _slice_debug = 1;
static void slice_print_mask(const char *label, struct slice_mask mask)
{
- char *p, buf[16 + 3 + 16 + 1];
+ char *p, buf[16 + 3 + 64 + 1];
int i;
if (!_slice_debug)
@@ -54,7 +59,7 @@ static void slice_print_mask(const char *label, struct slice_mask mask)
*(p++) = '-';
*(p++) = ' ';
for (i = 0; i < SLICE_NUM_HIGH; i++)
- *(p++) = (mask.high_slices & (1 << i)) ? '1' : '0';
+ *(p++) = (mask.high_slices & (1ul << i)) ? '1' : '0';
*(p++) = 0;
printk(KERN_DEBUG "%s:%s\n", label, buf);
@@ -84,8 +89,8 @@ static struct slice_mask slice_range_to_mask(unsigned long start,
}
if ((start + len) > SLICE_LOW_TOP)
- ret.high_slices = (1u << (GET_HIGH_SLICE_INDEX(end) + 1))
- - (1u << GET_HIGH_SLICE_INDEX(start));
+ ret.high_slices = (1ul << (GET_HIGH_SLICE_INDEX(end) + 1))
+ - (1ul << GET_HIGH_SLICE_INDEX(start));
return ret;
}
@@ -135,26 +140,31 @@ static struct slice_mask slice_mask_for_free(struct mm_struct *mm)
for (i = 0; i < SLICE_NUM_HIGH; i++)
if (!slice_high_has_vma(mm, i))
- ret.high_slices |= 1u << i;
+ ret.high_slices |= 1ul << i;
return ret;
}
static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize)
{
+ unsigned char *hpsizes;
+ int index, mask_index;
struct slice_mask ret = { 0, 0 };
unsigned long i;
- u64 psizes;
+ u64 lpsizes;
- psizes = mm->context.low_slices_psize;
+ lpsizes = mm->context.low_slices_psize;
for (i = 0; i < SLICE_NUM_LOW; i++)
- if (((psizes >> (i * 4)) & 0xf) == psize)
+ if (((lpsizes >> (i * 4)) & 0xf) == psize)
ret.low_slices |= 1u << i;
- psizes = mm->context.high_slices_psize;
- for (i = 0; i < SLICE_NUM_HIGH; i++)
- if (((psizes >> (i * 4)) & 0xf) == psize)
- ret.high_slices |= 1u << i;
+ hpsizes = mm->context.high_slices_psize;
+ for (i = 0; i < SLICE_NUM_HIGH; i++) {
+ mask_index = i & 0x1;
+ index = i >> 1;
+ if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == psize)
+ ret.high_slices |= 1ul << i;
+ }
return ret;
}
@@ -183,8 +193,10 @@ static void slice_flush_segments(void *parm)
static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psize)
{
+ int index, mask_index;
/* Write the new slice psize bits */
- u64 lpsizes, hpsizes;
+ unsigned char *hpsizes;
+ u64 lpsizes;
unsigned long i, flags;
slice_dbg("slice_convert(mm=%p, psize=%d)\n", mm, psize);
@@ -201,157 +213,136 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz
lpsizes = (lpsizes & ~(0xful << (i * 4))) |
(((unsigned long)psize) << (i * 4));
- hpsizes = mm->context.high_slices_psize;
- for (i = 0; i < SLICE_NUM_HIGH; i++)
- if (mask.high_slices & (1u << i))
- hpsizes = (hpsizes & ~(0xful << (i * 4))) |
- (((unsigned long)psize) << (i * 4));
-
+ /* Assign the value back */
mm->context.low_slices_psize = lpsizes;
- mm->context.high_slices_psize = hpsizes;
+
+ hpsizes = mm->context.high_slices_psize;
+ for (i = 0; i < SLICE_NUM_HIGH; i++) {
+ mask_index = i & 0x1;
+ index = i >> 1;
+ if (mask.high_slices & (1ul << i))
+ hpsizes[index] = (hpsizes[index] &
+ ~(0xf << (mask_index * 4))) |
+ (((unsigned long)psize) << (mask_index * 4));
+ }
slice_dbg(" lsps=%lx, hsps=%lx\n",
mm->context.low_slices_psize,
mm->context.high_slices_psize);
spin_unlock_irqrestore(&slice_convert_lock, flags);
- mb();
- /* XXX this is sub-optimal but will do for now */
- on_each_cpu(slice_flush_segments, mm, 0, 1);
#ifdef CONFIG_SPU_BASE
spu_flush_all_slbs(mm);
#endif
}
+/*
+ * Compute which slice addr is part of;
+ * set *boundary_addr to the start or end boundary of that slice
+ * (depending on 'end' parameter);
+ * return boolean indicating if the slice is marked as available in the
+ * 'available' slice_mark.
+ */
+static bool slice_scan_available(unsigned long addr,
+ struct slice_mask available,
+ int end,
+ unsigned long *boundary_addr)
+{
+ unsigned long slice;
+ if (addr < SLICE_LOW_TOP) {
+ slice = GET_LOW_SLICE_INDEX(addr);
+ *boundary_addr = (slice + end) << SLICE_LOW_SHIFT;
+ return !!(available.low_slices & (1u << slice));
+ } else {
+ slice = GET_HIGH_SLICE_INDEX(addr);
+ *boundary_addr = (slice + end) ?
+ ((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP;
+ return !!(available.high_slices & (1ul << slice));
+ }
+}
+
static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
unsigned long len,
struct slice_mask available,
- int psize, int use_cache)
+ int psize)
{
- struct vm_area_struct *vma;
- unsigned long start_addr, addr;
- struct slice_mask mask;
int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
-
- if (use_cache) {
- if (len <= mm->cached_hole_size) {
- start_addr = addr = TASK_UNMAPPED_BASE;
- mm->cached_hole_size = 0;
- } else
- start_addr = addr = mm->free_area_cache;
- } else
- start_addr = addr = TASK_UNMAPPED_BASE;
-
-full_search:
- for (;;) {
- addr = _ALIGN_UP(addr, 1ul << pshift);
- if ((TASK_SIZE - len) < addr)
- break;
- vma = find_vma(mm, addr);
- BUG_ON(vma && (addr >= vma->vm_end));
-
- mask = slice_range_to_mask(addr, len);
- if (!slice_check_fit(mask, available)) {
- if (addr < SLICE_LOW_TOP)
- addr = _ALIGN_UP(addr + 1, 1ul << SLICE_LOW_SHIFT);
- else
- addr = _ALIGN_UP(addr + 1, 1ul << SLICE_HIGH_SHIFT);
+ unsigned long addr, found, next_end;
+ struct vm_unmapped_area_info info;
+
+ info.flags = 0;
+ info.length = len;
+ info.align_mask = PAGE_MASK & ((1ul << pshift) - 1);
+ info.align_offset = 0;
+
+ addr = TASK_UNMAPPED_BASE;
+ while (addr < TASK_SIZE) {
+ info.low_limit = addr;
+ if (!slice_scan_available(addr, available, 1, &addr))
continue;
+
+ next_slice:
+ /*
+ * At this point [info.low_limit; addr) covers
+ * available slices only and ends at a slice boundary.
+ * Check if we need to reduce the range, or if we can
+ * extend it to cover the next available slice.
+ */
+ if (addr >= TASK_SIZE)
+ addr = TASK_SIZE;
+ else if (slice_scan_available(addr, available, 1, &next_end)) {
+ addr = next_end;
+ goto next_slice;
}
- if (!vma || addr + len <= vma->vm_start) {
- /*
- * Remember the place where we stopped the search:
- */
- if (use_cache)
- mm->free_area_cache = addr + len;
- return addr;
- }
- if (use_cache && (addr + mm->cached_hole_size) < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
- addr = vma->vm_end;
- }
+ info.high_limit = addr;
- /* Make sure we didn't miss any holes */
- if (use_cache && start_addr != TASK_UNMAPPED_BASE) {
- start_addr = addr = TASK_UNMAPPED_BASE;
- mm->cached_hole_size = 0;
- goto full_search;
+ found = vm_unmapped_area(&info);
+ if (!(found & ~PAGE_MASK))
+ return found;
}
+
return -ENOMEM;
}
static unsigned long slice_find_area_topdown(struct mm_struct *mm,
unsigned long len,
struct slice_mask available,
- int psize, int use_cache)
+ int psize)
{
- struct vm_area_struct *vma;
- unsigned long addr;
- struct slice_mask mask;
int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
+ unsigned long addr, found, prev;
+ struct vm_unmapped_area_info info;
- /* check if free_area_cache is useful for us */
- if (use_cache) {
- if (len <= mm->cached_hole_size) {
- mm->cached_hole_size = 0;
- mm->free_area_cache = mm->mmap_base;
- }
-
- /* either no address requested or can't fit in requested
- * address hole
- */
- addr = mm->free_area_cache;
-
- /* make sure it can fit in the remaining address space */
- if (addr > len) {
- addr = _ALIGN_DOWN(addr - len, 1ul << pshift);
- mask = slice_range_to_mask(addr, len);
- if (slice_check_fit(mask, available) &&
- slice_area_is_free(mm, addr, len))
- /* remember the address as a hint for
- * next time
- */
- return (mm->free_area_cache = addr);
- }
- }
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ info.length = len;
+ info.align_mask = PAGE_MASK & ((1ul << pshift) - 1);
+ info.align_offset = 0;
addr = mm->mmap_base;
- while (addr > len) {
- /* Go down by chunk size */
- addr = _ALIGN_DOWN(addr - len, 1ul << pshift);
-
- /* Check for hit with different page size */
- mask = slice_range_to_mask(addr, len);
- if (!slice_check_fit(mask, available)) {
- if (addr < SLICE_LOW_TOP)
- addr = _ALIGN_DOWN(addr, 1ul << SLICE_LOW_SHIFT);
- else if (addr < (1ul << SLICE_HIGH_SHIFT))
- addr = SLICE_LOW_TOP;
- else
- addr = _ALIGN_DOWN(addr, 1ul << SLICE_HIGH_SHIFT);
+ while (addr > PAGE_SIZE) {
+ info.high_limit = addr;
+ if (!slice_scan_available(addr - 1, available, 0, &addr))
continue;
- }
+ prev_slice:
/*
- * Lookup failure means no vma is above this address,
- * else if new region fits below vma->vm_start,
- * return with success:
+ * At this point [addr; info.high_limit) covers
+ * available slices only and starts at a slice boundary.
+ * Check if we need to reduce the range, or if we can
+ * extend it to cover the previous available slice.
*/
- vma = find_vma(mm, addr);
- if (!vma || (addr + len) <= vma->vm_start) {
- /* remember the address as a hint for next time */
- if (use_cache)
- mm->free_area_cache = addr;
- return addr;
+ if (addr < PAGE_SIZE)
+ addr = PAGE_SIZE;
+ else if (slice_scan_available(addr - 1, available, 0, &prev)) {
+ addr = prev;
+ goto prev_slice;
}
+ info.low_limit = addr;
- /* remember the largest hole we saw so far */
- if (use_cache && (addr + mm->cached_hole_size) < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
-
- /* try just below the current vma->vm_start */
- addr = vma->vm_start;
+ found = vm_unmapped_area(&info);
+ if (!(found & ~PAGE_MASK))
+ return found;
}
/*
@@ -360,48 +351,55 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm,
* can happen with large stack limits and large mmap()
* allocations.
*/
- addr = slice_find_area_bottomup(mm, len, available, psize, 0);
-
- /*
- * Restore the topdown base:
- */
- if (use_cache) {
- mm->free_area_cache = mm->mmap_base;
- mm->cached_hole_size = ~0UL;
- }
-
- return addr;
+ return slice_find_area_bottomup(mm, len, available, psize);
}
static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len,
struct slice_mask mask, int psize,
- int topdown, int use_cache)
+ int topdown)
{
if (topdown)
- return slice_find_area_topdown(mm, len, mask, psize, use_cache);
+ return slice_find_area_topdown(mm, len, mask, psize);
else
- return slice_find_area_bottomup(mm, len, mask, psize, use_cache);
+ return slice_find_area_bottomup(mm, len, mask, psize);
}
+#define or_mask(dst, src) do { \
+ (dst).low_slices |= (src).low_slices; \
+ (dst).high_slices |= (src).high_slices; \
+} while (0)
+
+#define andnot_mask(dst, src) do { \
+ (dst).low_slices &= ~(src).low_slices; \
+ (dst).high_slices &= ~(src).high_slices; \
+} while (0)
+
+#ifdef CONFIG_PPC_64K_PAGES
+#define MMU_PAGE_BASE MMU_PAGE_64K
+#else
+#define MMU_PAGE_BASE MMU_PAGE_4K
+#endif
+
unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
unsigned long flags, unsigned int psize,
- int topdown, int use_cache)
+ int topdown)
{
- struct slice_mask mask;
+ struct slice_mask mask = {0, 0};
struct slice_mask good_mask;
struct slice_mask potential_mask = {0,0} /* silence stupid warning */;
- int pmask_set = 0;
+ struct slice_mask compat_mask = {0, 0};
int fixed = (flags & MAP_FIXED);
int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
struct mm_struct *mm = current->mm;
+ unsigned long newaddr;
/* Sanity checks */
BUG_ON(mm->task_size == 0);
slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize);
- slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d, use_cache=%d\n",
- addr, len, flags, topdown, use_cache);
+ slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d\n",
+ addr, len, flags, topdown);
if (len > mm->task_size)
return -ENOMEM;
@@ -410,27 +408,54 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
if (fixed && (addr & ((1ul << pshift) - 1)))
return -EINVAL;
if (fixed && addr > (mm->task_size - len))
- return -EINVAL;
+ return -ENOMEM;
/* If hint, make sure it matches our alignment restrictions */
if (!fixed && addr) {
addr = _ALIGN_UP(addr, 1ul << pshift);
slice_dbg(" aligned addr=%lx\n", addr);
+ /* Ignore hint if it's too large or overlaps a VMA */
+ if (addr > mm->task_size - len ||
+ !slice_area_is_free(mm, addr, len))
+ addr = 0;
}
- /* First makeup a "good" mask of slices that have the right size
+ /* First make up a "good" mask of slices that have the right size
* already
*/
good_mask = slice_mask_for_size(mm, psize);
slice_print_mask(" good_mask", good_mask);
- /* First check hint if it's valid or if we have MAP_FIXED */
- if ((addr != 0 || fixed) && (mm->task_size - len) >= addr) {
+ /*
+ * Here "good" means slices that are already the right page size,
+ * "compat" means slices that have a compatible page size (i.e.
+ * 4k in a 64k pagesize kernel), and "free" means slices without
+ * any VMAs.
+ *
+ * If MAP_FIXED:
+ * check if fits in good | compat => OK
+ * check if fits in good | compat | free => convert free
+ * else bad
+ * If have hint:
+ * check if hint fits in good => OK
+ * check if hint fits in good | free => convert free
+ * Otherwise:
+ * search in good, found => OK
+ * search in good | free, found => convert free
+ * search in good | compat | free, found => convert free.
+ */
- /* Don't bother with hint if it overlaps a VMA */
- if (!fixed && !slice_area_is_free(mm, addr, len))
- goto search;
+#ifdef CONFIG_PPC_64K_PAGES
+ /* If we support combo pages, we can allow 64k pages in 4k slices */
+ if (psize == MMU_PAGE_64K) {
+ compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K);
+ if (fixed)
+ or_mask(good_mask, compat_mask);
+ }
+#endif
+ /* First check hint if it's valid or if we have MAP_FIXED */
+ if (addr != 0 || fixed) {
/* Build a mask for the requested range */
mask = slice_range_to_mask(addr, len);
slice_print_mask(" mask", mask);
@@ -442,54 +467,63 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
slice_dbg(" fits good !\n");
return addr;
}
-
- /* We don't fit in the good mask, check what other slices are
- * empty and thus can be converted
+ } else {
+ /* Now let's see if we can find something in the existing
+ * slices for that size
*/
- potential_mask = slice_mask_for_free(mm);
- potential_mask.low_slices |= good_mask.low_slices;
- potential_mask.high_slices |= good_mask.high_slices;
- pmask_set = 1;
- slice_print_mask(" potential", potential_mask);
- if (slice_check_fit(mask, potential_mask)) {
- slice_dbg(" fits potential !\n");
- goto convert;
+ newaddr = slice_find_area(mm, len, good_mask, psize, topdown);
+ if (newaddr != -ENOMEM) {
+ /* Found within the good mask, we don't have to setup,
+ * we thus return directly
+ */
+ slice_dbg(" found area at 0x%lx\n", newaddr);
+ return newaddr;
}
}
- /* If we have MAP_FIXED and failed the above step, then error out */
+ /* We don't fit in the good mask, check what other slices are
+ * empty and thus can be converted
+ */
+ potential_mask = slice_mask_for_free(mm);
+ or_mask(potential_mask, good_mask);
+ slice_print_mask(" potential", potential_mask);
+
+ if ((addr != 0 || fixed) && slice_check_fit(mask, potential_mask)) {
+ slice_dbg(" fits potential !\n");
+ goto convert;
+ }
+
+ /* If we have MAP_FIXED and failed the above steps, then error out */
if (fixed)
return -EBUSY;
- search:
slice_dbg(" search...\n");
- /* Now let's see if we can find something in the existing slices
- * for that size
+ /* If we had a hint that didn't work out, see if we can fit
+ * anywhere in the good area.
*/
- addr = slice_find_area(mm, len, good_mask, psize, topdown, use_cache);
- if (addr != -ENOMEM) {
- /* Found within the good mask, we don't have to setup,
- * we thus return directly
- */
- slice_dbg(" found area at 0x%lx\n", addr);
- return addr;
- }
-
- /* Won't fit, check what can be converted */
- if (!pmask_set) {
- potential_mask = slice_mask_for_free(mm);
- potential_mask.low_slices |= good_mask.low_slices;
- potential_mask.high_slices |= good_mask.high_slices;
- pmask_set = 1;
- slice_print_mask(" potential", potential_mask);
+ if (addr) {
+ addr = slice_find_area(mm, len, good_mask, psize, topdown);
+ if (addr != -ENOMEM) {
+ slice_dbg(" found area at 0x%lx\n", addr);
+ return addr;
+ }
}
/* Now let's see if we can find something in the existing slices
- * for that size
+ * for that size plus free slices
*/
- addr = slice_find_area(mm, len, potential_mask, psize, topdown,
- use_cache);
+ addr = slice_find_area(mm, len, potential_mask, psize, topdown);
+
+#ifdef CONFIG_PPC_64K_PAGES
+ if (addr == -ENOMEM && psize == MMU_PAGE_64K) {
+ /* retry the search with 4k-page slices included */
+ or_mask(potential_mask, compat_mask);
+ addr = slice_find_area(mm, len, potential_mask, psize,
+ topdown);
+ }
+#endif
+
if (addr == -ENOMEM)
return -ENOMEM;
@@ -498,7 +532,13 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
slice_print_mask(" mask", mask);
convert:
- slice_convert(mm, mask, psize);
+ andnot_mask(mask, good_mask);
+ andnot_mask(mask, compat_mask);
+ if (mask.low_slices || mask.high_slices) {
+ slice_convert(mm, mask, psize);
+ if (psize > MMU_PAGE_BASE)
+ on_each_cpu(slice_flush_segments, mm, 1);
+ }
return addr;
}
@@ -511,8 +551,7 @@ unsigned long arch_get_unmapped_area(struct file *filp,
unsigned long flags)
{
return slice_get_unmapped_area(addr, len, flags,
- current->mm->context.user_psize,
- 0, 1);
+ current->mm->context.user_psize, 0);
}
unsigned long arch_get_unmapped_area_topdown(struct file *filp,
@@ -522,24 +561,24 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp,
const unsigned long flags)
{
return slice_get_unmapped_area(addr0, len, flags,
- current->mm->context.user_psize,
- 1, 1);
+ current->mm->context.user_psize, 1);
}
unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr)
{
- u64 psizes;
- int index;
+ unsigned char *hpsizes;
+ int index, mask_index;
if (addr < SLICE_LOW_TOP) {
- psizes = mm->context.low_slices_psize;
+ u64 lpsizes;
+ lpsizes = mm->context.low_slices_psize;
index = GET_LOW_SLICE_INDEX(addr);
- } else {
- psizes = mm->context.high_slices_psize;
- index = GET_HIGH_SLICE_INDEX(addr);
+ return (lpsizes >> (index * 4)) & 0xf;
}
-
- return (psizes >> (index * 4)) & 0xf;
+ hpsizes = mm->context.high_slices_psize;
+ index = GET_HIGH_SLICE_INDEX(addr);
+ mask_index = index & 0x1;
+ return (hpsizes[index >> 1] >> (mask_index * 4)) & 0xf;
}
EXPORT_SYMBOL_GPL(get_slice_psize);
@@ -559,7 +598,9 @@ EXPORT_SYMBOL_GPL(get_slice_psize);
*/
void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
{
- unsigned long flags, lpsizes, hpsizes;
+ int index, mask_index;
+ unsigned char *hpsizes;
+ unsigned long flags, lpsizes;
unsigned int old_psize;
int i;
@@ -580,15 +621,21 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
if (((lpsizes >> (i * 4)) & 0xf) == old_psize)
lpsizes = (lpsizes & ~(0xful << (i * 4))) |
(((unsigned long)psize) << (i * 4));
+ /* Assign the value back */
+ mm->context.low_slices_psize = lpsizes;
hpsizes = mm->context.high_slices_psize;
- for (i = 0; i < SLICE_NUM_HIGH; i++)
- if (((hpsizes >> (i * 4)) & 0xf) == old_psize)
- hpsizes = (hpsizes & ~(0xful << (i * 4))) |
- (((unsigned long)psize) << (i * 4));
+ for (i = 0; i < SLICE_NUM_HIGH; i++) {
+ mask_index = i & 0x1;
+ index = i >> 1;
+ if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == old_psize)
+ hpsizes[index] = (hpsizes[index] &
+ ~(0xf << (mask_index * 4))) |
+ (((unsigned long)psize) << (mask_index * 4));
+ }
+
+
- mm->context.low_slices_psize = lpsizes;
- mm->context.high_slices_psize = hpsizes;
slice_dbg(" lsps=%lx, hsps=%lx\n",
mm->context.low_slices_psize,
@@ -598,8 +645,47 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
spin_unlock_irqrestore(&slice_convert_lock, flags);
}
+void slice_set_psize(struct mm_struct *mm, unsigned long address,
+ unsigned int psize)
+{
+ unsigned char *hpsizes;
+ unsigned long i, flags;
+ u64 *lpsizes;
+
+ spin_lock_irqsave(&slice_convert_lock, flags);
+ if (address < SLICE_LOW_TOP) {
+ i = GET_LOW_SLICE_INDEX(address);
+ lpsizes = &mm->context.low_slices_psize;
+ *lpsizes = (*lpsizes & ~(0xful << (i * 4))) |
+ ((unsigned long) psize << (i * 4));
+ } else {
+ int index, mask_index;
+ i = GET_HIGH_SLICE_INDEX(address);
+ hpsizes = mm->context.high_slices_psize;
+ mask_index = i & 0x1;
+ index = i >> 1;
+ hpsizes[index] = (hpsizes[index] &
+ ~(0xf << (mask_index * 4))) |
+ (((unsigned long)psize) << (mask_index * 4));
+ }
+
+ spin_unlock_irqrestore(&slice_convert_lock, flags);
+
+#ifdef CONFIG_SPU_BASE
+ spu_flush_all_slbs(mm);
+#endif
+}
+
+void slice_set_range_psize(struct mm_struct *mm, unsigned long start,
+ unsigned long len, unsigned int psize)
+{
+ struct slice_mask mask = slice_range_to_mask(start, len);
+
+ slice_convert(mm, mask, psize);
+}
+
/*
- * is_hugepage_only_range() is used by generic code to verify wether
+ * is_hugepage_only_range() is used by generic code to verify whether
* a normal mmap mapping (non hugetlbfs) is valid on a given area.
*
* until the generic code provides a more generic hook and/or starts
@@ -621,9 +707,18 @@ int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
unsigned long len)
{
struct slice_mask mask, available;
+ unsigned int psize = mm->context.user_psize;
mask = slice_range_to_mask(addr, len);
- available = slice_mask_for_size(mm, mm->context.user_psize);
+ available = slice_mask_for_size(mm, psize);
+#ifdef CONFIG_PPC_64K_PAGES
+ /* We need to account for 4k slices too */
+ if (psize == MMU_PAGE_64K) {
+ struct slice_mask compat_mask;
+ compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K);
+ or_mask(available, compat_mask);
+ }
+#endif
#if 0 /* too verbose */
slice_dbg("is_hugepage_only_range(mm=%p, addr=%lx, len=%lx)\n",
diff --git a/arch/powerpc/mm/stab.c b/arch/powerpc/mm/stab.c
index 50448d5de9d..3f8efa6f299 100644
--- a/arch/powerpc/mm/stab.c
+++ b/arch/powerpc/mm/stab.c
@@ -12,15 +12,14 @@
* 2 of the License, or (at your option) any later version.
*/
+#include <linux/memblock.h>
+
#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/mmu_context.h>
#include <asm/paca.h>
#include <asm/cputable.h>
-#include <asm/lmb.h>
-#include <asm/abs_addr.h>
-#include <asm/firmware.h>
-#include <asm/iseries/hv_call.h>
+#include <asm/prom.h>
struct stab_entry {
unsigned long esid_data;
@@ -28,8 +27,8 @@ struct stab_entry {
};
#define NR_STAB_CACHE_ENTRIES 8
-DEFINE_PER_CPU(long, stab_cache_ptr);
-DEFINE_PER_CPU(long, stab_cache[NR_STAB_CACHE_ENTRIES]);
+static DEFINE_PER_CPU(long, stab_cache_ptr);
+static DEFINE_PER_CPU(long [NR_STAB_CACHE_ENTRIES], stab_cache);
/*
* Create a segment table entry for the given esid/vsid pair.
@@ -162,7 +161,7 @@ void switch_stab(struct task_struct *tsk, struct mm_struct *mm)
{
struct stab_entry *stab = (struct stab_entry *) get_paca()->stab_addr;
struct stab_entry *ste;
- unsigned long offset = __get_cpu_var(stab_cache_ptr);
+ unsigned long offset;
unsigned long pc = KSTK_EIP(tsk);
unsigned long stack = KSTK_ESP(tsk);
unsigned long unmapped_base;
@@ -170,6 +169,15 @@ void switch_stab(struct task_struct *tsk, struct mm_struct *mm)
/* Force previous translations to complete. DRENG */
asm volatile("isync" : : : "memory");
+ /*
+ * We need interrupts hard-disabled here, not just soft-disabled,
+ * so that a PMU interrupt can't occur, which might try to access
+ * user memory (to get a stack trace) and possible cause an STAB miss
+ * which would update the stab_cache/stab_cache_ptr per-cpu variables.
+ */
+ hard_irq_disable();
+
+ offset = __get_cpu_var(stab_cache_ptr);
if (offset <= NR_STAB_CACHE_ENTRIES) {
int i;
@@ -232,7 +240,7 @@ void __init stabs_alloc(void)
{
int cpu;
- if (cpu_has_feature(CPU_FTR_SLB))
+ if (mmu_has_feature(MMU_FTR_SLB))
return;
for_each_possible_cpu(cpu) {
@@ -241,16 +249,16 @@ void __init stabs_alloc(void)
if (cpu == 0)
continue; /* stab for CPU 0 is statically allocated */
- newstab = lmb_alloc_base(HW_PAGE_SIZE, HW_PAGE_SIZE,
+ newstab = memblock_alloc_base(HW_PAGE_SIZE, HW_PAGE_SIZE,
1<<SID_SHIFT);
newstab = (unsigned long)__va(newstab);
memset((void *)newstab, 0, HW_PAGE_SIZE);
paca[cpu].stab_addr = newstab;
- paca[cpu].stab_real = virt_to_abs(newstab);
- printk(KERN_INFO "Segment table for CPU %d at 0x%lx "
- "virtual, 0x%lx absolute\n",
+ paca[cpu].stab_real = __pa(newstab);
+ printk(KERN_INFO "Segment table for CPU %d at 0x%llx "
+ "virtual, 0x%llx absolute\n",
cpu, paca[cpu].stab_addr, paca[cpu].stab_real);
}
}
@@ -274,12 +282,5 @@ void stab_initialize(unsigned long stab)
/* Set ASR */
stabreal = get_paca()->stab_real | 0x1ul;
-#ifdef CONFIG_PPC_ISERIES
- if (firmware_has_feature(FW_FEATURE_ISERIES)) {
- HvCall1(HvCallBaseSetASR, stabreal);
- return;
- }
-#endif /* CONFIG_PPC_ISERIES */
-
mtspr(SPRN_ASR, stabreal);
}
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index 4cafc0c33d0..6c0b1f5f8d2 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -10,7 +10,6 @@
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/gfp.h>
-#include <linux/slab.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
@@ -24,9 +23,9 @@
* Also makes sure that the subpage_prot_table structure is
* reinitialized for the next user.
*/
-void subpage_prot_free(pgd_t *pgd)
+void subpage_prot_free(struct mm_struct *mm)
{
- struct subpage_prot_table *spt = pgd_subpage_prot(pgd);
+ struct subpage_prot_table *spt = &mm->context.spt;
unsigned long i, j, addr;
u32 **p;
@@ -51,6 +50,13 @@ void subpage_prot_free(pgd_t *pgd)
spt->maxaddr = 0;
}
+void subpage_prot_init_new_context(struct mm_struct *mm)
+{
+ struct subpage_prot_table *spt = &mm->context.spt;
+
+ memset(spt, 0, sizeof(*spt));
+}
+
static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
int npages)
{
@@ -72,7 +78,7 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
arch_enter_lazy_mmu_mode();
for (; npages > 0; --npages) {
- pte_update(mm, addr, pte, 0, 0);
+ pte_update(mm, addr, pte, 0, 0, 0);
addr += PAGE_SIZE;
++pte;
}
@@ -87,9 +93,10 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
static void subpage_prot_clear(unsigned long addr, unsigned long len)
{
struct mm_struct *mm = current->mm;
- struct subpage_prot_table *spt = pgd_subpage_prot(mm->pgd);
+ struct subpage_prot_table *spt = &mm->context.spt;
u32 **spm, *spp;
- int i, nw;
+ unsigned long i;
+ size_t nw;
unsigned long next, limit;
down_write(&mm->mmap_sem);
@@ -98,7 +105,7 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len)
limit = spt->maxaddr;
for (; addr < limit; addr = next) {
next = pmd_addr_end(addr, limit);
- if (addr < 0x100000000) {
+ if (addr < 0x100000000UL) {
spm = spt->low_prot;
} else {
spm = spt->protptrs[addr >> SBP_L3_SHIFT];
@@ -123,6 +130,53 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len)
up_write(&mm->mmap_sem);
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct vm_area_struct *vma = walk->private;
+ split_huge_page_pmd(vma, addr, pmd);
+ return 0;
+}
+
+static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
+ unsigned long len)
+{
+ struct vm_area_struct *vma;
+ struct mm_walk subpage_proto_walk = {
+ .mm = mm,
+ .pmd_entry = subpage_walk_pmd_entry,
+ };
+
+ /*
+ * We don't try too hard, we just mark all the vma in that range
+ * VM_NOHUGEPAGE and split them.
+ */
+ vma = find_vma(mm, addr);
+ /*
+ * If the range is in unmapped range, just return
+ */
+ if (vma && ((addr + len) <= vma->vm_start))
+ return;
+
+ while (vma) {
+ if (vma->vm_start >= (addr + len))
+ break;
+ vma->vm_flags |= VM_NOHUGEPAGE;
+ subpage_proto_walk.private = vma;
+ walk_page_range(vma->vm_start, vma->vm_end,
+ &subpage_proto_walk);
+ vma = vma->vm_next;
+ }
+}
+#else
+static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
+ unsigned long len)
+{
+ return;
+}
+#endif
+
/*
* Copy in a subpage protection map for an address range.
* The map has 2 bits per 4k subpage, so 32 bits per 64k page.
@@ -136,9 +190,10 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len)
long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
{
struct mm_struct *mm = current->mm;
- struct subpage_prot_table *spt = pgd_subpage_prot(mm->pgd);
+ struct subpage_prot_table *spt = &mm->context.spt;
u32 **spm, *spp;
- int i, nw;
+ unsigned long i;
+ size_t nw;
unsigned long next, limit;
int err;
@@ -160,10 +215,11 @@ long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
return -EFAULT;
down_write(&mm->mmap_sem);
+ subpage_mark_vma_nohuge(mm, addr, len);
for (limit = addr + len; addr < limit; addr = next) {
next = pmd_addr_end(addr, limit);
err = -ENOMEM;
- if (addr < 0x100000000) {
+ if (addr < 0x100000000UL) {
spm = spt->low_prot;
} else {
spm = spt->protptrs[addr >> SBP_L3_SHIFT];
diff --git a/arch/powerpc/mm/tlb_32.c b/arch/powerpc/mm/tlb_hash32.c
index eb4b512d65f..558e30cce33 100644
--- a/arch/powerpc/mm/tlb_32.c
+++ b/arch/powerpc/mm/tlb_hash32.c
@@ -27,6 +27,7 @@
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
+#include <linux/export.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
@@ -45,6 +46,7 @@ void flush_hash_entry(struct mm_struct *mm, pte_t *ptep, unsigned long addr)
flush_hash_pages(mm->context.id, addr, ptephys, 1);
}
}
+EXPORT_SYMBOL(flush_hash_entry);
/*
* Called by ptep_set_access_flags, must flush on CPUs for which the
@@ -85,17 +87,6 @@ void tlb_flush(struct mmu_gather *tlb)
* -- Cort
*/
-/*
- * 750 SMP is a Bad Idea because the 750 doesn't broadcast all
- * the cache operations on the bus. Hence we need to use an IPI
- * to get the other CPU(s) to invalidate their TLBs.
- */
-#ifdef CONFIG_SMP_750
-#define FINISH_FLUSH smp_send_tlb_invalidate(0)
-#else
-#define FINISH_FLUSH do { } while (0)
-#endif
-
static void flush_range(struct mm_struct *mm, unsigned long start,
unsigned long end)
{
@@ -134,8 +125,8 @@ static void flush_range(struct mm_struct *mm, unsigned long start,
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
flush_range(&init_mm, start, end);
- FINISH_FLUSH;
}
+EXPORT_SYMBOL(flush_tlb_kernel_range);
/*
* Flush all the (user) entries for the address space described by mm.
@@ -157,8 +148,8 @@ void flush_tlb_mm(struct mm_struct *mm)
*/
for (mp = mm->mmap; mp != NULL; mp = mp->vm_next)
flush_range(mp->vm_mm, mp->vm_start, mp->vm_end);
- FINISH_FLUSH;
}
+EXPORT_SYMBOL(flush_tlb_mm);
void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
{
@@ -173,8 +164,8 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
pmd = pmd_offset(pud_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr);
if (!pmd_none(*pmd))
flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1);
- FINISH_FLUSH;
}
+EXPORT_SYMBOL(flush_tlb_page);
/*
* For each address in the range, find the pte for the address
@@ -185,5 +176,9 @@ void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end)
{
flush_range(vma->vm_mm, start, end);
- FINISH_FLUSH;
+}
+EXPORT_SYMBOL(flush_tlb_range);
+
+void __init early_init_mmu(void)
+{
}
diff --git a/arch/powerpc/mm/tlb_64.c b/arch/powerpc/mm/tlb_hash64.c
index e2d867ce1c7..c99f6510a0b 100644
--- a/arch/powerpc/mm/tlb_64.c
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -23,7 +23,6 @@
#include <linux/kernel.h>
#include <linux/mm.h>
-#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <asm/pgalloc.h>
@@ -33,102 +32,18 @@
DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
-/* This is declared as we are using the more or less generic
- * include/asm-powerpc/tlb.h file -- tgall
- */
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
-unsigned long pte_freelist_forced_free;
-
-struct pte_freelist_batch
-{
- struct rcu_head rcu;
- unsigned int index;
- pgtable_free_t tables[0];
-};
-
-DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
-unsigned long pte_freelist_forced_free;
-
-#define PTE_FREELIST_SIZE \
- ((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \
- / sizeof(pgtable_free_t))
-
-static void pte_free_smp_sync(void *arg)
-{
- /* Do nothing, just ensure we sync with all CPUs */
-}
-
-/* This is only called when we are critically out of memory
- * (and fail to get a page in pte_free_tlb).
- */
-static void pgtable_free_now(pgtable_free_t pgf)
-{
- pte_freelist_forced_free++;
-
- smp_call_function(pte_free_smp_sync, NULL, 0, 1);
-
- pgtable_free(pgf);
-}
-
-static void pte_free_rcu_callback(struct rcu_head *head)
-{
- struct pte_freelist_batch *batch =
- container_of(head, struct pte_freelist_batch, rcu);
- unsigned int i;
-
- for (i = 0; i < batch->index; i++)
- pgtable_free(batch->tables[i]);
-
- free_page((unsigned long)batch);
-}
-
-static void pte_free_submit(struct pte_freelist_batch *batch)
-{
- INIT_RCU_HEAD(&batch->rcu);
- call_rcu(&batch->rcu, pte_free_rcu_callback);
-}
-
-void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
-{
- /* This is safe since tlb_gather_mmu has disabled preemption */
- cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id());
- struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
-
- if (atomic_read(&tlb->mm->mm_users) < 2 ||
- cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask)) {
- pgtable_free(pgf);
- return;
- }
-
- if (*batchp == NULL) {
- *batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC);
- if (*batchp == NULL) {
- pgtable_free_now(pgf);
- return;
- }
- (*batchp)->index = 0;
- }
- (*batchp)->tables[(*batchp)->index++] = pgf;
- if ((*batchp)->index == PTE_FREELIST_SIZE) {
- pte_free_submit(*batchp);
- *batchp = NULL;
- }
-}
-
/*
* A linux PTE was changed and the corresponding hash table entry
* neesd to be flushed. This function will either perform the flush
* immediately or will batch it up if the current CPU has an active
* batch on it.
- *
- * Must be called from within some kind of spinlock/non-preempt region...
*/
void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned long pte, int huge)
{
- struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
- unsigned long vsid, vaddr;
+ unsigned long vpn;
+ struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch);
+ unsigned long vsid;
unsigned int psize;
int ssize;
real_pte_t rpte;
@@ -136,11 +51,6 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
i = batch->index;
- /* We mask the address for the base page size. Huge pages will
- * have applied their own masking already
- */
- addr &= PAGE_MASK;
-
/* Get page size (maybe move back to caller).
*
* NOTE: when using special 64K mappings in 4K environment like
@@ -150,24 +60,33 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
*/
if (huge) {
#ifdef CONFIG_HUGETLB_PAGE
- psize = mmu_huge_psize;
+ psize = get_slice_psize(mm, addr);
+ /* Mask the address for the correct page size */
+ addr &= ~((1UL << mmu_psize_defs[psize].shift) - 1);
#else
BUG();
psize = pte_pagesize_index(mm, addr, pte); /* shutup gcc */
#endif
- } else
+ } else {
psize = pte_pagesize_index(mm, addr, pte);
+ /* Mask the address for the standard page size. If we
+ * have a 64k page kernel, but the hardware does not
+ * support 64k pages, this might be different from the
+ * hardware page size encoded in the slice table. */
+ addr &= PAGE_MASK;
+ }
+
/* Build full vaddr */
if (!is_kernel_addr(addr)) {
ssize = user_segment_size(addr);
vsid = get_vsid(mm->context.id, addr, ssize);
- WARN_ON(vsid == 0);
} else {
vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
ssize = mmu_kernel_ssize;
}
- vaddr = hpt_va(addr, vsid, ssize);
+ WARN_ON(vsid == 0);
+ vpn = hpt_vpn(addr, vsid, ssize);
rpte = __real_pte(__pte(pte), ptep);
/*
@@ -177,7 +96,8 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
* and decide to use local invalidates instead...
*/
if (!batch->active) {
- flush_hash_page(vaddr, rpte, psize, ssize, 0);
+ flush_hash_page(vpn, rpte, psize, ssize, 0);
+ put_cpu_var(ppc64_tlb_batch);
return;
}
@@ -202,10 +122,11 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
batch->ssize = ssize;
}
batch->pte[i] = rpte;
- batch->vaddr[i] = vaddr;
+ batch->vpn[i] = vpn;
batch->index = ++i;
if (i >= PPC64_TLB_BATCH_NR)
__flush_tlb_pending(batch);
+ put_cpu_var(ppc64_tlb_batch);
}
/*
@@ -217,30 +138,33 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
*/
void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
{
- cpumask_t tmp;
+ const struct cpumask *tmp;
int i, local = 0;
i = batch->index;
- tmp = cpumask_of_cpu(smp_processor_id());
- if (cpus_equal(batch->mm->cpu_vm_mask, tmp))
+ tmp = cpumask_of(smp_processor_id());
+ if (cpumask_equal(mm_cpumask(batch->mm), tmp))
local = 1;
if (i == 1)
- flush_hash_page(batch->vaddr[0], batch->pte[0],
+ flush_hash_page(batch->vpn[0], batch->pte[0],
batch->psize, batch->ssize, local);
else
flush_hash_range(i, local);
batch->index = 0;
}
-void pte_free_finish(void)
+void tlb_flush(struct mmu_gather *tlb)
{
- /* This is safe since tlb_gather_mmu has disabled preemption */
- struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
+ struct ppc64_tlb_batch *tlbbatch = &get_cpu_var(ppc64_tlb_batch);
- if (*batchp == NULL)
- return;
- pte_free_submit(*batchp);
- *batchp = NULL;
+ /* If there's a TLB batch pending, then we must flush it because the
+ * pages are going to be freed and we really don't want to have a CPU
+ * access a freed page because it has a stale TLB
+ */
+ if (tlbbatch->index)
+ __flush_tlb_pending(tlbbatch);
+
+ put_cpu_var(ppc64_tlb_batch);
}
/**
@@ -258,14 +182,13 @@ void pte_free_finish(void)
* since 64K pages may overlap with other bridges when using 64K pages
* with 4K HW pages on IO space.
*
- * Because of that usage pattern, it's only available with CONFIG_HOTPLUG
- * and is implemented for small size rather than speed.
+ * Because of that usage pattern, it is implemented for small size rather
+ * than speed.
*/
-#ifdef CONFIG_HOTPLUG
-
void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
unsigned long end)
{
+ int hugepage_shift;
unsigned long flags;
start = _ALIGN_DOWN(start, PAGE_SIZE);
@@ -283,7 +206,8 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
local_irq_save(flags);
arch_enter_lazy_mmu_mode();
for (; start < end; start += PAGE_SIZE) {
- pte_t *ptep = find_linux_pte(mm->pgd, start);
+ pte_t *ptep = find_linux_pte_or_hugepte(mm->pgd, start,
+ &hugepage_shift);
unsigned long pte;
if (ptep == NULL)
@@ -291,10 +215,38 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
pte = pte_val(*ptep);
if (!(pte & _PAGE_HASHPTE))
continue;
- hpte_need_flush(mm, start, ptep, pte, 0);
+ if (unlikely(hugepage_shift && pmd_trans_huge(*(pmd_t *)pte)))
+ hpte_do_hugepage_flush(mm, start, (pmd_t *)pte);
+ else
+ hpte_need_flush(mm, start, ptep, pte, 0);
}
arch_leave_lazy_mmu_mode();
local_irq_restore(flags);
}
-#endif /* CONFIG_HOTPLUG */
+void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
+{
+ pte_t *pte;
+ pte_t *start_pte;
+ unsigned long flags;
+
+ addr = _ALIGN_DOWN(addr, PMD_SIZE);
+ /* Note: Normally, we should only ever use a batch within a
+ * PTE locked section. This violates the rule, but will work
+ * since we don't actually modify the PTEs, we just flush the
+ * hash while leaving the PTEs intact (including their reference
+ * to being hashed). This is not the most performance oriented
+ * way to do things but is fine for our needs here.
+ */
+ local_irq_save(flags);
+ arch_enter_lazy_mmu_mode();
+ start_pte = pte_offset_map(pmd, addr);
+ for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) {
+ unsigned long pteval = pte_val(*pte);
+ if (pteval & _PAGE_HASHPTE)
+ hpte_need_flush(mm, addr, pte, pteval, 0);
+ addr += PAGE_SIZE;
+ }
+ arch_leave_lazy_mmu_mode();
+ local_irq_restore(flags);
+}
diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S
new file mode 100644
index 00000000000..356e8b41fb0
--- /dev/null
+++ b/arch/powerpc/mm/tlb_low_64e.S
@@ -0,0 +1,1171 @@
+/*
+ * Low level TLB miss handlers for Book3E
+ *
+ * Copyright (C) 2008-2009
+ * Ben. Herrenschmidt (benh@kernel.crashing.org), IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/processor.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/cputable.h>
+#include <asm/pgtable.h>
+#include <asm/exception-64e.h>
+#include <asm/ppc-opcode.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_booke_hv_asm.h>
+
+#ifdef CONFIG_PPC_64K_PAGES
+#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE+1)
+#else
+#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE)
+#endif
+#define VPTE_PUD_SHIFT (VPTE_PMD_SHIFT + PMD_INDEX_SIZE)
+#define VPTE_PGD_SHIFT (VPTE_PUD_SHIFT + PUD_INDEX_SIZE)
+#define VPTE_INDEX_SIZE (VPTE_PGD_SHIFT + PGD_INDEX_SIZE)
+
+/**********************************************************************
+ * *
+ * TLB miss handling for Book3E with a bolted linear mapping *
+ * No virtual page table, no nested TLB misses *
+ * *
+ **********************************************************************/
+
+/*
+ * Note that, unlike non-bolted handlers, TLB_EXFRAME is not
+ * modified by the TLB miss handlers themselves, since the TLB miss
+ * handler code will not itself cause a recursive TLB miss.
+ *
+ * TLB_EXFRAME will be modified when crit/mc/debug exceptions are
+ * entered/exited.
+ */
+.macro tlb_prolog_bolted intnum addr
+ mtspr SPRN_SPRG_GEN_SCRATCH,r12
+ mfspr r12,SPRN_SPRG_TLB_EXFRAME
+ std r13,EX_TLB_R13(r12)
+ std r10,EX_TLB_R10(r12)
+ mfspr r13,SPRN_SPRG_PACA
+
+ mfcr r10
+ std r11,EX_TLB_R11(r12)
+#ifdef CONFIG_KVM_BOOKE_HV
+BEGIN_FTR_SECTION
+ mfspr r11, SPRN_SRR1
+END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
+#endif
+ DO_KVM \intnum, SPRN_SRR1
+ std r16,EX_TLB_R16(r12)
+ mfspr r16,\addr /* get faulting address */
+ std r14,EX_TLB_R14(r12)
+ ld r14,PACAPGD(r13)
+ std r15,EX_TLB_R15(r12)
+ std r10,EX_TLB_CR(r12)
+ TLB_MISS_PROLOG_STATS
+.endm
+
+.macro tlb_epilog_bolted
+ ld r14,EX_TLB_CR(r12)
+ ld r10,EX_TLB_R10(r12)
+ ld r11,EX_TLB_R11(r12)
+ ld r13,EX_TLB_R13(r12)
+ mtcr r14
+ ld r14,EX_TLB_R14(r12)
+ ld r15,EX_TLB_R15(r12)
+ TLB_MISS_RESTORE_STATS
+ ld r16,EX_TLB_R16(r12)
+ mfspr r12,SPRN_SPRG_GEN_SCRATCH
+.endm
+
+/* Data TLB miss */
+ START_EXCEPTION(data_tlb_miss_bolted)
+ tlb_prolog_bolted BOOKE_INTERRUPT_DTLB_MISS SPRN_DEAR
+
+ /* We need _PAGE_PRESENT and _PAGE_ACCESSED set */
+
+ /* We do the user/kernel test for the PID here along with the RW test
+ */
+ /* We pre-test some combination of permissions to avoid double
+ * faults:
+ *
+ * We move the ESR:ST bit into the position of _PAGE_BAP_SW in the PTE
+ * ESR_ST is 0x00800000
+ * _PAGE_BAP_SW is 0x00000010
+ * So the shift is >> 19. This tests for supervisor writeability.
+ * If the page happens to be supervisor writeable and not user
+ * writeable, we will take a new fault later, but that should be
+ * a rare enough case.
+ *
+ * We also move ESR_ST in _PAGE_DIRTY position
+ * _PAGE_DIRTY is 0x00001000 so the shift is >> 11
+ *
+ * MAS1 is preset for all we need except for TID that needs to
+ * be cleared for kernel translations
+ */
+
+ mfspr r11,SPRN_ESR
+
+ srdi r15,r16,60 /* get region */
+ rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4
+ bne- dtlb_miss_fault_bolted /* Bail if fault addr is invalid */
+
+ rlwinm r10,r11,32-19,27,27
+ rlwimi r10,r11,32-16,19,19
+ cmpwi r15,0 /* user vs kernel check */
+ ori r10,r10,_PAGE_PRESENT
+ oris r11,r10,_PAGE_ACCESSED@h
+
+ TLB_MISS_STATS_SAVE_INFO_BOLTED
+ bne tlb_miss_kernel_bolted
+
+tlb_miss_common_bolted:
+/*
+ * This is the guts of the TLB miss handler for bolted-linear.
+ * We are entered with:
+ *
+ * r16 = faulting address
+ * r15 = crap (free to use)
+ * r14 = page table base
+ * r13 = PACA
+ * r11 = PTE permission mask
+ * r10 = crap (free to use)
+ */
+ rldicl r15,r16,64-PGDIR_SHIFT+3,64-PGD_INDEX_SIZE-3
+ cmpldi cr0,r14,0
+ clrrdi r15,r15,3
+ beq tlb_miss_fault_bolted /* No PGDIR, bail */
+
+BEGIN_MMU_FTR_SECTION
+ /* Set the TLB reservation and search for existing entry. Then load
+ * the entry.
+ */
+ PPC_TLBSRX_DOT(0,R16)
+ ldx r14,r14,r15 /* grab pgd entry */
+ beq tlb_miss_done_bolted /* tlb exists already, bail */
+MMU_FTR_SECTION_ELSE
+ ldx r14,r14,r15 /* grab pgd entry */
+ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV)
+
+#ifndef CONFIG_PPC_64K_PAGES
+ rldicl r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3
+ clrrdi r15,r15,3
+ cmpdi cr0,r14,0
+ bge tlb_miss_fault_bolted /* Bad pgd entry or hugepage; bail */
+ ldx r14,r14,r15 /* grab pud entry */
+#endif /* CONFIG_PPC_64K_PAGES */
+
+ rldicl r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3
+ clrrdi r15,r15,3
+ cmpdi cr0,r14,0
+ bge tlb_miss_fault_bolted
+ ldx r14,r14,r15 /* Grab pmd entry */
+
+ rldicl r15,r16,64-PAGE_SHIFT+3,64-PTE_INDEX_SIZE-3
+ clrrdi r15,r15,3
+ cmpdi cr0,r14,0
+ bge tlb_miss_fault_bolted
+ ldx r14,r14,r15 /* Grab PTE, normal (!huge) page */
+
+ /* Check if required permissions are met */
+ andc. r15,r11,r14
+ rldicr r15,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT
+ bne- tlb_miss_fault_bolted
+
+ /* Now we build the MAS:
+ *
+ * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG
+ * MAS 1 : Almost fully setup
+ * - PID already updated by caller if necessary
+ * - TSIZE need change if !base page size, not
+ * yet implemented for now
+ * MAS 2 : Defaults not useful, need to be redone
+ * MAS 3+7 : Needs to be done
+ */
+ clrrdi r11,r16,12 /* Clear low crap in EA */
+ clrldi r15,r15,12 /* Clear crap at the top */
+ rlwimi r11,r14,32-19,27,31 /* Insert WIMGE */
+ rlwimi r15,r14,32-8,22,25 /* Move in U bits */
+ mtspr SPRN_MAS2,r11
+ andi. r11,r14,_PAGE_DIRTY
+ rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */
+
+ /* Mask out SW and UW if !DIRTY (XXX optimize this !) */
+ bne 1f
+ li r11,MAS3_SW|MAS3_UW
+ andc r15,r15,r11
+1:
+ mtspr SPRN_MAS7_MAS3,r15
+ tlbwe
+
+tlb_miss_done_bolted:
+ TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK)
+ tlb_epilog_bolted
+ rfi
+
+itlb_miss_kernel_bolted:
+ li r11,_PAGE_PRESENT|_PAGE_BAP_SX /* Base perm */
+ oris r11,r11,_PAGE_ACCESSED@h
+tlb_miss_kernel_bolted:
+ mfspr r10,SPRN_MAS1
+ ld r14,PACA_KERNELPGD(r13)
+ cmpldi cr0,r15,8 /* Check for vmalloc region */
+ rlwinm r10,r10,0,16,1 /* Clear TID */
+ mtspr SPRN_MAS1,r10
+ beq+ tlb_miss_common_bolted
+
+tlb_miss_fault_bolted:
+ /* We need to check if it was an instruction miss */
+ andi. r10,r11,_PAGE_EXEC|_PAGE_BAP_SX
+ bne itlb_miss_fault_bolted
+dtlb_miss_fault_bolted:
+ TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT)
+ tlb_epilog_bolted
+ b exc_data_storage_book3e
+itlb_miss_fault_bolted:
+ TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT)
+ tlb_epilog_bolted
+ b exc_instruction_storage_book3e
+
+/* Instruction TLB miss */
+ START_EXCEPTION(instruction_tlb_miss_bolted)
+ tlb_prolog_bolted BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR0
+
+ rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4
+ srdi r15,r16,60 /* get region */
+ TLB_MISS_STATS_SAVE_INFO_BOLTED
+ bne- itlb_miss_fault_bolted
+
+ li r11,_PAGE_PRESENT|_PAGE_EXEC /* Base perm */
+
+ /* We do the user/kernel test for the PID here along with the RW test
+ */
+
+ cmpldi cr0,r15,0 /* Check for user region */
+ oris r11,r11,_PAGE_ACCESSED@h
+ beq tlb_miss_common_bolted
+ b itlb_miss_kernel_bolted
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+/*
+ * TLB miss handling for e6500 and derivatives, using hardware tablewalk.
+ *
+ * Linear mapping is bolted: no virtual page table or nested TLB misses
+ * Indirect entries in TLB1, hardware loads resulting direct entries
+ * into TLB0
+ * No HES or NV hint on TLB1, so we need to do software round-robin
+ * No tlbsrx. so we need a spinlock, and we have to deal
+ * with MAS-damage caused by tlbsx
+ * 4K pages only
+ */
+
+ START_EXCEPTION(instruction_tlb_miss_e6500)
+ tlb_prolog_bolted BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR0
+
+ ld r11,PACA_TCD_PTR(r13)
+ srdi. r15,r16,60 /* get region */
+ ori r16,r16,1
+
+ TLB_MISS_STATS_SAVE_INFO_BOLTED
+ bne tlb_miss_kernel_e6500 /* user/kernel test */
+
+ b tlb_miss_common_e6500
+
+ START_EXCEPTION(data_tlb_miss_e6500)
+ tlb_prolog_bolted BOOKE_INTERRUPT_DTLB_MISS SPRN_DEAR
+
+ ld r11,PACA_TCD_PTR(r13)
+ srdi. r15,r16,60 /* get region */
+ rldicr r16,r16,0,62
+
+ TLB_MISS_STATS_SAVE_INFO_BOLTED
+ bne tlb_miss_kernel_e6500 /* user vs kernel check */
+
+/*
+ * This is the guts of the TLB miss handler for e6500 and derivatives.
+ * We are entered with:
+ *
+ * r16 = page of faulting address (low bit 0 if data, 1 if instruction)
+ * r15 = crap (free to use)
+ * r14 = page table base
+ * r13 = PACA
+ * r11 = tlb_per_core ptr
+ * r10 = cpu number
+ */
+tlb_miss_common_e6500:
+ /*
+ * Search if we already have an indirect entry for that virtual
+ * address, and if we do, bail out.
+ *
+ * MAS6:IND should be already set based on MAS4
+ */
+1: lbarx r15,0,r11
+ lhz r10,PACAPACAINDEX(r13)
+ cmpdi r15,0
+ cmpdi cr1,r15,1 /* set cr1.eq = 0 for non-recursive */
+ bne 2f
+ stbcx. r10,0,r11
+ bne 1b
+3:
+ .subsection 1
+2: cmpd cr1,r15,r10 /* recursive lock due to mcheck/crit/etc? */
+ beq cr1,3b /* unlock will happen if cr1.eq = 0 */
+ lbz r15,0(r11)
+ cmpdi r15,0
+ bne 2b
+ b 1b
+ .previous
+
+ mfspr r15,SPRN_MAS2
+
+ tlbsx 0,r16
+ mfspr r10,SPRN_MAS1
+ andis. r10,r10,MAS1_VALID@h
+ bne tlb_miss_done_e6500
+
+ /* Undo MAS-damage from the tlbsx */
+ mfspr r10,SPRN_MAS1
+ oris r10,r10,MAS1_VALID@h
+ mtspr SPRN_MAS1,r10
+ mtspr SPRN_MAS2,r15
+
+ /* Now, we need to walk the page tables. First check if we are in
+ * range.
+ */
+ rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4
+ bne- tlb_miss_fault_e6500
+
+ rldicl r15,r16,64-PGDIR_SHIFT+3,64-PGD_INDEX_SIZE-3
+ cmpldi cr0,r14,0
+ clrrdi r15,r15,3
+ beq- tlb_miss_fault_e6500 /* No PGDIR, bail */
+ ldx r14,r14,r15 /* grab pgd entry */
+
+ rldicl r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3
+ clrrdi r15,r15,3
+ cmpdi cr0,r14,0
+ bge tlb_miss_fault_e6500 /* Bad pgd entry or hugepage; bail */
+ ldx r14,r14,r15 /* grab pud entry */
+
+ rldicl r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3
+ clrrdi r15,r15,3
+ cmpdi cr0,r14,0
+ bge tlb_miss_fault_e6500
+ ldx r14,r14,r15 /* Grab pmd entry */
+
+ mfspr r10,SPRN_MAS0
+ cmpdi cr0,r14,0
+ bge tlb_miss_fault_e6500
+
+ /* Now we build the MAS for a 2M indirect page:
+ *
+ * MAS 0 : ESEL needs to be filled by software round-robin
+ * MAS 1 : Fully set up
+ * - PID already updated by caller if necessary
+ * - TSIZE for now is base ind page size always
+ * - TID already cleared if necessary
+ * MAS 2 : Default not 2M-aligned, need to be redone
+ * MAS 3+7 : Needs to be done
+ */
+
+ ori r14,r14,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT)
+ mtspr SPRN_MAS7_MAS3,r14
+
+ clrrdi r15,r16,21 /* make EA 2M-aligned */
+ mtspr SPRN_MAS2,r15
+
+ lbz r15,TCD_ESEL_NEXT(r11)
+ lbz r16,TCD_ESEL_MAX(r11)
+ lbz r14,TCD_ESEL_FIRST(r11)
+ rlwimi r10,r15,16,0x00ff0000 /* insert esel_next into MAS0 */
+ addi r15,r15,1 /* increment esel_next */
+ mtspr SPRN_MAS0,r10
+ cmpw r15,r16
+ iseleq r15,r14,r15 /* if next == last use first */
+ stb r15,TCD_ESEL_NEXT(r11)
+
+ tlbwe
+
+tlb_miss_done_e6500:
+ .macro tlb_unlock_e6500
+ beq cr1,1f /* no unlock if lock was recursively grabbed */
+ li r15,0
+ isync
+ stb r15,0(r11)
+1:
+ .endm
+
+ tlb_unlock_e6500
+ TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK)
+ tlb_epilog_bolted
+ rfi
+
+tlb_miss_kernel_e6500:
+ mfspr r10,SPRN_MAS1
+ ld r14,PACA_KERNELPGD(r13)
+ cmpldi cr0,r15,8 /* Check for vmalloc region */
+ rlwinm r10,r10,0,16,1 /* Clear TID */
+ mtspr SPRN_MAS1,r10
+ beq+ tlb_miss_common_e6500
+
+tlb_miss_fault_e6500:
+ tlb_unlock_e6500
+ /* We need to check if it was an instruction miss */
+ andi. r16,r16,1
+ bne itlb_miss_fault_e6500
+dtlb_miss_fault_e6500:
+ TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT)
+ tlb_epilog_bolted
+ b exc_data_storage_book3e
+itlb_miss_fault_e6500:
+ TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT)
+ tlb_epilog_bolted
+ b exc_instruction_storage_book3e
+#endif /* CONFIG_PPC_FSL_BOOK3E */
+
+/**********************************************************************
+ * *
+ * TLB miss handling for Book3E with TLB reservation and HES support *
+ * *
+ **********************************************************************/
+
+
+/* Data TLB miss */
+ START_EXCEPTION(data_tlb_miss)
+ TLB_MISS_PROLOG
+
+ /* Now we handle the fault proper. We only save DEAR in normal
+ * fault case since that's the only interesting values here.
+ * We could probably also optimize by not saving SRR0/1 in the
+ * linear mapping case but I'll leave that for later
+ */
+ mfspr r14,SPRN_ESR
+ mfspr r16,SPRN_DEAR /* get faulting address */
+ srdi r15,r16,60 /* get region */
+ cmpldi cr0,r15,0xc /* linear mapping ? */
+ TLB_MISS_STATS_SAVE_INFO
+ beq tlb_load_linear /* yes -> go to linear map load */
+
+ /* The page tables are mapped virtually linear. At this point, though,
+ * we don't know whether we are trying to fault in a first level
+ * virtual address or a virtual page table address. We can get that
+ * from bit 0x1 of the region ID which we have set for a page table
+ */
+ andi. r10,r15,0x1
+ bne- virt_page_table_tlb_miss
+
+ std r14,EX_TLB_ESR(r12); /* save ESR */
+ std r16,EX_TLB_DEAR(r12); /* save DEAR */
+
+ /* We need _PAGE_PRESENT and _PAGE_ACCESSED set */
+ li r11,_PAGE_PRESENT
+ oris r11,r11,_PAGE_ACCESSED@h
+
+ /* We do the user/kernel test for the PID here along with the RW test
+ */
+ cmpldi cr0,r15,0 /* Check for user region */
+
+ /* We pre-test some combination of permissions to avoid double
+ * faults:
+ *
+ * We move the ESR:ST bit into the position of _PAGE_BAP_SW in the PTE
+ * ESR_ST is 0x00800000
+ * _PAGE_BAP_SW is 0x00000010
+ * So the shift is >> 19. This tests for supervisor writeability.
+ * If the page happens to be supervisor writeable and not user
+ * writeable, we will take a new fault later, but that should be
+ * a rare enough case.
+ *
+ * We also move ESR_ST in _PAGE_DIRTY position
+ * _PAGE_DIRTY is 0x00001000 so the shift is >> 11
+ *
+ * MAS1 is preset for all we need except for TID that needs to
+ * be cleared for kernel translations
+ */
+ rlwimi r11,r14,32-19,27,27
+ rlwimi r11,r14,32-16,19,19
+ beq normal_tlb_miss
+ /* XXX replace the RMW cycles with immediate loads + writes */
+1: mfspr r10,SPRN_MAS1
+ cmpldi cr0,r15,8 /* Check for vmalloc region */
+ rlwinm r10,r10,0,16,1 /* Clear TID */
+ mtspr SPRN_MAS1,r10
+ beq+ normal_tlb_miss
+
+ /* We got a crappy address, just fault with whatever DEAR and ESR
+ * are here
+ */
+ TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT)
+ TLB_MISS_EPILOG_ERROR
+ b exc_data_storage_book3e
+
+/* Instruction TLB miss */
+ START_EXCEPTION(instruction_tlb_miss)
+ TLB_MISS_PROLOG
+
+ /* If we take a recursive fault, the second level handler may need
+ * to know whether we are handling a data or instruction fault in
+ * order to get to the right store fault handler. We provide that
+ * info by writing a crazy value in ESR in our exception frame
+ */
+ li r14,-1 /* store to exception frame is done later */
+
+ /* Now we handle the fault proper. We only save DEAR in the non
+ * linear mapping case since we know the linear mapping case will
+ * not re-enter. We could indeed optimize and also not save SRR0/1
+ * in the linear mapping case but I'll leave that for later
+ *
+ * Faulting address is SRR0 which is already in r16
+ */
+ srdi r15,r16,60 /* get region */
+ cmpldi cr0,r15,0xc /* linear mapping ? */
+ TLB_MISS_STATS_SAVE_INFO
+ beq tlb_load_linear /* yes -> go to linear map load */
+
+ /* We do the user/kernel test for the PID here along with the RW test
+ */
+ li r11,_PAGE_PRESENT|_PAGE_EXEC /* Base perm */
+ oris r11,r11,_PAGE_ACCESSED@h
+
+ cmpldi cr0,r15,0 /* Check for user region */
+ std r14,EX_TLB_ESR(r12) /* write crazy -1 to frame */
+ beq normal_tlb_miss
+
+ li r11,_PAGE_PRESENT|_PAGE_BAP_SX /* Base perm */
+ oris r11,r11,_PAGE_ACCESSED@h
+ /* XXX replace the RMW cycles with immediate loads + writes */
+ mfspr r10,SPRN_MAS1
+ cmpldi cr0,r15,8 /* Check for vmalloc region */
+ rlwinm r10,r10,0,16,1 /* Clear TID */
+ mtspr SPRN_MAS1,r10
+ beq+ normal_tlb_miss
+
+ /* We got a crappy address, just fault */
+ TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT)
+ TLB_MISS_EPILOG_ERROR
+ b exc_instruction_storage_book3e
+
+/*
+ * This is the guts of the first-level TLB miss handler for direct
+ * misses. We are entered with:
+ *
+ * r16 = faulting address
+ * r15 = region ID
+ * r14 = crap (free to use)
+ * r13 = PACA
+ * r12 = TLB exception frame in PACA
+ * r11 = PTE permission mask
+ * r10 = crap (free to use)
+ */
+normal_tlb_miss:
+ /* So we first construct the page table address. We do that by
+ * shifting the bottom of the address (not the region ID) by
+ * PAGE_SHIFT-3, clearing the bottom 3 bits (get a PTE ptr) and
+ * or'ing the fourth high bit.
+ *
+ * NOTE: For 64K pages, we do things slightly differently in
+ * order to handle the weird page table format used by linux
+ */
+ ori r10,r15,0x1
+#ifdef CONFIG_PPC_64K_PAGES
+ /* For the top bits, 16 bytes per PTE */
+ rldicl r14,r16,64-(PAGE_SHIFT-4),PAGE_SHIFT-4+4
+ /* Now create the bottom bits as 0 in position 0x8000 and
+ * the rest calculated for 8 bytes per PTE
+ */
+ rldicl r15,r16,64-(PAGE_SHIFT-3),64-15
+ /* Insert the bottom bits in */
+ rlwimi r14,r15,0,16,31
+#else
+ rldicl r14,r16,64-(PAGE_SHIFT-3),PAGE_SHIFT-3+4
+#endif
+ sldi r15,r10,60
+ clrrdi r14,r14,3
+ or r10,r15,r14
+
+BEGIN_MMU_FTR_SECTION
+ /* Set the TLB reservation and search for existing entry. Then load
+ * the entry.
+ */
+ PPC_TLBSRX_DOT(0,R16)
+ ld r14,0(r10)
+ beq normal_tlb_miss_done
+MMU_FTR_SECTION_ELSE
+ ld r14,0(r10)
+ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV)
+
+finish_normal_tlb_miss:
+ /* Check if required permissions are met */
+ andc. r15,r11,r14
+ bne- normal_tlb_miss_access_fault
+
+ /* Now we build the MAS:
+ *
+ * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG
+ * MAS 1 : Almost fully setup
+ * - PID already updated by caller if necessary
+ * - TSIZE need change if !base page size, not
+ * yet implemented for now
+ * MAS 2 : Defaults not useful, need to be redone
+ * MAS 3+7 : Needs to be done
+ *
+ * TODO: mix up code below for better scheduling
+ */
+ clrrdi r11,r16,12 /* Clear low crap in EA */
+ rlwimi r11,r14,32-19,27,31 /* Insert WIMGE */
+ mtspr SPRN_MAS2,r11
+
+ /* Check page size, if not standard, update MAS1 */
+ rldicl r11,r14,64-8,64-8
+#ifdef CONFIG_PPC_64K_PAGES
+ cmpldi cr0,r11,BOOK3E_PAGESZ_64K
+#else
+ cmpldi cr0,r11,BOOK3E_PAGESZ_4K
+#endif
+ beq- 1f
+ mfspr r11,SPRN_MAS1
+ rlwimi r11,r14,31,21,24
+ rlwinm r11,r11,0,21,19
+ mtspr SPRN_MAS1,r11
+1:
+ /* Move RPN in position */
+ rldicr r11,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT
+ clrldi r15,r11,12 /* Clear crap at the top */
+ rlwimi r15,r14,32-8,22,25 /* Move in U bits */
+ rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */
+
+ /* Mask out SW and UW if !DIRTY (XXX optimize this !) */
+ andi. r11,r14,_PAGE_DIRTY
+ bne 1f
+ li r11,MAS3_SW|MAS3_UW
+ andc r15,r15,r11
+1:
+BEGIN_MMU_FTR_SECTION
+ srdi r16,r15,32
+ mtspr SPRN_MAS3,r15
+ mtspr SPRN_MAS7,r16
+MMU_FTR_SECTION_ELSE
+ mtspr SPRN_MAS7_MAS3,r15
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS)
+
+ tlbwe
+
+normal_tlb_miss_done:
+ /* We don't bother with restoring DEAR or ESR since we know we are
+ * level 0 and just going back to userland. They are only needed
+ * if you are going to take an access fault
+ */
+ TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK)
+ TLB_MISS_EPILOG_SUCCESS
+ rfi
+
+normal_tlb_miss_access_fault:
+ /* We need to check if it was an instruction miss */
+ andi. r10,r11,_PAGE_EXEC
+ bne 1f
+ ld r14,EX_TLB_DEAR(r12)
+ ld r15,EX_TLB_ESR(r12)
+ mtspr SPRN_DEAR,r14
+ mtspr SPRN_ESR,r15
+ TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT)
+ TLB_MISS_EPILOG_ERROR
+ b exc_data_storage_book3e
+1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT)
+ TLB_MISS_EPILOG_ERROR
+ b exc_instruction_storage_book3e
+
+
+/*
+ * This is the guts of the second-level TLB miss handler for direct
+ * misses. We are entered with:
+ *
+ * r16 = virtual page table faulting address
+ * r15 = region (top 4 bits of address)
+ * r14 = crap (free to use)
+ * r13 = PACA
+ * r12 = TLB exception frame in PACA
+ * r11 = crap (free to use)
+ * r10 = crap (free to use)
+ *
+ * Note that this should only ever be called as a second level handler
+ * with the current scheme when using SW load.
+ * That means we can always get the original fault DEAR at
+ * EX_TLB_DEAR-EX_TLB_SIZE(r12)
+ *
+ * It can be re-entered by the linear mapping miss handler. However, to
+ * avoid too much complication, it will restart the whole fault at level
+ * 0 so we don't care too much about clobbers
+ *
+ * XXX That code was written back when we couldn't clobber r14. We can now,
+ * so we could probably optimize things a bit
+ */
+virt_page_table_tlb_miss:
+ /* Are we hitting a kernel page table ? */
+ andi. r10,r15,0x8
+
+ /* The cool thing now is that r10 contains 0 for user and 8 for kernel,
+ * and we happen to have the swapper_pg_dir at offset 8 from the user
+ * pgdir in the PACA :-).
+ */
+ add r11,r10,r13
+
+ /* If kernel, we need to clear MAS1 TID */
+ beq 1f
+ /* XXX replace the RMW cycles with immediate loads + writes */
+ mfspr r10,SPRN_MAS1
+ rlwinm r10,r10,0,16,1 /* Clear TID */
+ mtspr SPRN_MAS1,r10
+1:
+BEGIN_MMU_FTR_SECTION
+ /* Search if we already have a TLB entry for that virtual address, and
+ * if we do, bail out.
+ */
+ PPC_TLBSRX_DOT(0,R16)
+ beq virt_page_table_tlb_miss_done
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV)
+
+ /* Now, we need to walk the page tables. First check if we are in
+ * range.
+ */
+ rldicl. r10,r16,64-(VPTE_INDEX_SIZE+3),VPTE_INDEX_SIZE+3+4
+ bne- virt_page_table_tlb_miss_fault
+
+ /* Get the PGD pointer */
+ ld r15,PACAPGD(r11)
+ cmpldi cr0,r15,0
+ beq- virt_page_table_tlb_miss_fault
+
+ /* Get to PGD entry */
+ rldicl r11,r16,64-VPTE_PGD_SHIFT,64-PGD_INDEX_SIZE-3
+ clrrdi r10,r11,3
+ ldx r15,r10,r15
+ cmpdi cr0,r15,0
+ bge virt_page_table_tlb_miss_fault
+
+#ifndef CONFIG_PPC_64K_PAGES
+ /* Get to PUD entry */
+ rldicl r11,r16,64-VPTE_PUD_SHIFT,64-PUD_INDEX_SIZE-3
+ clrrdi r10,r11,3
+ ldx r15,r10,r15
+ cmpdi cr0,r15,0
+ bge virt_page_table_tlb_miss_fault
+#endif /* CONFIG_PPC_64K_PAGES */
+
+ /* Get to PMD entry */
+ rldicl r11,r16,64-VPTE_PMD_SHIFT,64-PMD_INDEX_SIZE-3
+ clrrdi r10,r11,3
+ ldx r15,r10,r15
+ cmpdi cr0,r15,0
+ bge virt_page_table_tlb_miss_fault
+
+ /* Ok, we're all right, we can now create a kernel translation for
+ * a 4K or 64K page from r16 -> r15.
+ */
+ /* Now we build the MAS:
+ *
+ * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG
+ * MAS 1 : Almost fully setup
+ * - PID already updated by caller if necessary
+ * - TSIZE for now is base page size always
+ * MAS 2 : Use defaults
+ * MAS 3+7 : Needs to be done
+ *
+ * So we only do MAS 2 and 3 for now...
+ */
+ clrldi r11,r15,4 /* remove region ID from RPN */
+ ori r10,r11,1 /* Or-in SR */
+
+BEGIN_MMU_FTR_SECTION
+ srdi r16,r10,32
+ mtspr SPRN_MAS3,r10
+ mtspr SPRN_MAS7,r16
+MMU_FTR_SECTION_ELSE
+ mtspr SPRN_MAS7_MAS3,r10
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS)
+
+ tlbwe
+
+BEGIN_MMU_FTR_SECTION
+virt_page_table_tlb_miss_done:
+
+ /* We have overriden MAS2:EPN but currently our primary TLB miss
+ * handler will always restore it so that should not be an issue,
+ * if we ever optimize the primary handler to not write MAS2 on
+ * some cases, we'll have to restore MAS2:EPN here based on the
+ * original fault's DEAR. If we do that we have to modify the
+ * ITLB miss handler to also store SRR0 in the exception frame
+ * as DEAR.
+ *
+ * However, one nasty thing we did is we cleared the reservation
+ * (well, potentially we did). We do a trick here thus if we
+ * are not a level 0 exception (we interrupted the TLB miss) we
+ * offset the return address by -4 in order to replay the tlbsrx
+ * instruction there
+ */
+ subf r10,r13,r12
+ cmpldi cr0,r10,PACA_EXTLB+EX_TLB_SIZE
+ bne- 1f
+ ld r11,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13)
+ addi r10,r11,-4
+ std r10,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13)
+1:
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV)
+ /* Return to caller, normal case */
+ TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK);
+ TLB_MISS_EPILOG_SUCCESS
+ rfi
+
+virt_page_table_tlb_miss_fault:
+ /* If we fault here, things are a little bit tricky. We need to call
+ * either data or instruction store fault, and we need to retrieve
+ * the original fault address and ESR (for data).
+ *
+ * The thing is, we know that in normal circumstances, this is
+ * always called as a second level tlb miss for SW load or as a first
+ * level TLB miss for HW load, so we should be able to peek at the
+ * relevant information in the first exception frame in the PACA.
+ *
+ * However, we do need to double check that, because we may just hit
+ * a stray kernel pointer or a userland attack trying to hit those
+ * areas. If that is the case, we do a data fault. (We can't get here
+ * from an instruction tlb miss anyway).
+ *
+ * Note also that when going to a fault, we must unwind the previous
+ * level as well. Since we are doing that, we don't need to clear or
+ * restore the TLB reservation neither.
+ */
+ subf r10,r13,r12
+ cmpldi cr0,r10,PACA_EXTLB+EX_TLB_SIZE
+ bne- virt_page_table_tlb_miss_whacko_fault
+
+ /* We dig the original DEAR and ESR from slot 0 */
+ ld r15,EX_TLB_DEAR+PACA_EXTLB(r13)
+ ld r16,EX_TLB_ESR+PACA_EXTLB(r13)
+
+ /* We check for the "special" ESR value for instruction faults */
+ cmpdi cr0,r16,-1
+ beq 1f
+ mtspr SPRN_DEAR,r15
+ mtspr SPRN_ESR,r16
+ TLB_MISS_STATS_D(MMSTAT_TLB_MISS_PT_FAULT);
+ TLB_MISS_EPILOG_ERROR
+ b exc_data_storage_book3e
+1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_PT_FAULT);
+ TLB_MISS_EPILOG_ERROR
+ b exc_instruction_storage_book3e
+
+virt_page_table_tlb_miss_whacko_fault:
+ /* The linear fault will restart everything so ESR and DEAR will
+ * not have been clobbered, let's just fault with what we have
+ */
+ TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_FAULT);
+ TLB_MISS_EPILOG_ERROR
+ b exc_data_storage_book3e
+
+
+/**************************************************************
+ * *
+ * TLB miss handling for Book3E with hw page table support *
+ * *
+ **************************************************************/
+
+
+/* Data TLB miss */
+ START_EXCEPTION(data_tlb_miss_htw)
+ TLB_MISS_PROLOG
+
+ /* Now we handle the fault proper. We only save DEAR in normal
+ * fault case since that's the only interesting values here.
+ * We could probably also optimize by not saving SRR0/1 in the
+ * linear mapping case but I'll leave that for later
+ */
+ mfspr r14,SPRN_ESR
+ mfspr r16,SPRN_DEAR /* get faulting address */
+ srdi r11,r16,60 /* get region */
+ cmpldi cr0,r11,0xc /* linear mapping ? */
+ TLB_MISS_STATS_SAVE_INFO
+ beq tlb_load_linear /* yes -> go to linear map load */
+
+ /* We do the user/kernel test for the PID here along with the RW test
+ */
+ cmpldi cr0,r11,0 /* Check for user region */
+ ld r15,PACAPGD(r13) /* Load user pgdir */
+ beq htw_tlb_miss
+
+ /* XXX replace the RMW cycles with immediate loads + writes */
+1: mfspr r10,SPRN_MAS1
+ cmpldi cr0,r11,8 /* Check for vmalloc region */
+ rlwinm r10,r10,0,16,1 /* Clear TID */
+ mtspr SPRN_MAS1,r10
+ ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */
+ beq+ htw_tlb_miss
+
+ /* We got a crappy address, just fault with whatever DEAR and ESR
+ * are here
+ */
+ TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT)
+ TLB_MISS_EPILOG_ERROR
+ b exc_data_storage_book3e
+
+/* Instruction TLB miss */
+ START_EXCEPTION(instruction_tlb_miss_htw)
+ TLB_MISS_PROLOG
+
+ /* If we take a recursive fault, the second level handler may need
+ * to know whether we are handling a data or instruction fault in
+ * order to get to the right store fault handler. We provide that
+ * info by keeping a crazy value for ESR in r14
+ */
+ li r14,-1 /* store to exception frame is done later */
+
+ /* Now we handle the fault proper. We only save DEAR in the non
+ * linear mapping case since we know the linear mapping case will
+ * not re-enter. We could indeed optimize and also not save SRR0/1
+ * in the linear mapping case but I'll leave that for later
+ *
+ * Faulting address is SRR0 which is already in r16
+ */
+ srdi r11,r16,60 /* get region */
+ cmpldi cr0,r11,0xc /* linear mapping ? */
+ TLB_MISS_STATS_SAVE_INFO
+ beq tlb_load_linear /* yes -> go to linear map load */
+
+ /* We do the user/kernel test for the PID here along with the RW test
+ */
+ cmpldi cr0,r11,0 /* Check for user region */
+ ld r15,PACAPGD(r13) /* Load user pgdir */
+ beq htw_tlb_miss
+
+ /* XXX replace the RMW cycles with immediate loads + writes */
+1: mfspr r10,SPRN_MAS1
+ cmpldi cr0,r11,8 /* Check for vmalloc region */
+ rlwinm r10,r10,0,16,1 /* Clear TID */
+ mtspr SPRN_MAS1,r10
+ ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */
+ beq+ htw_tlb_miss
+
+ /* We got a crappy address, just fault */
+ TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT)
+ TLB_MISS_EPILOG_ERROR
+ b exc_instruction_storage_book3e
+
+
+/*
+ * This is the guts of the second-level TLB miss handler for direct
+ * misses. We are entered with:
+ *
+ * r16 = virtual page table faulting address
+ * r15 = PGD pointer
+ * r14 = ESR
+ * r13 = PACA
+ * r12 = TLB exception frame in PACA
+ * r11 = crap (free to use)
+ * r10 = crap (free to use)
+ *
+ * It can be re-entered by the linear mapping miss handler. However, to
+ * avoid too much complication, it will save/restore things for us
+ */
+htw_tlb_miss:
+ /* Search if we already have a TLB entry for that virtual address, and
+ * if we do, bail out.
+ *
+ * MAS1:IND should be already set based on MAS4
+ */
+ PPC_TLBSRX_DOT(0,R16)
+ beq htw_tlb_miss_done
+
+ /* Now, we need to walk the page tables. First check if we are in
+ * range.
+ */
+ rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4
+ bne- htw_tlb_miss_fault
+
+ /* Get the PGD pointer */
+ cmpldi cr0,r15,0
+ beq- htw_tlb_miss_fault
+
+ /* Get to PGD entry */
+ rldicl r11,r16,64-(PGDIR_SHIFT-3),64-PGD_INDEX_SIZE-3
+ clrrdi r10,r11,3
+ ldx r15,r10,r15
+ cmpdi cr0,r15,0
+ bge htw_tlb_miss_fault
+
+#ifndef CONFIG_PPC_64K_PAGES
+ /* Get to PUD entry */
+ rldicl r11,r16,64-(PUD_SHIFT-3),64-PUD_INDEX_SIZE-3
+ clrrdi r10,r11,3
+ ldx r15,r10,r15
+ cmpdi cr0,r15,0
+ bge htw_tlb_miss_fault
+#endif /* CONFIG_PPC_64K_PAGES */
+
+ /* Get to PMD entry */
+ rldicl r11,r16,64-(PMD_SHIFT-3),64-PMD_INDEX_SIZE-3
+ clrrdi r10,r11,3
+ ldx r15,r10,r15
+ cmpdi cr0,r15,0
+ bge htw_tlb_miss_fault
+
+ /* Ok, we're all right, we can now create an indirect entry for
+ * a 1M or 256M page.
+ *
+ * The last trick is now that because we use "half" pages for
+ * the HTW (1M IND is 2K and 256M IND is 32K) we need to account
+ * for an added LSB bit to the RPN. For 64K pages, there is no
+ * problem as we already use 32K arrays (half PTE pages), but for
+ * 4K page we need to extract a bit from the virtual address and
+ * insert it into the "PA52" bit of the RPN.
+ */
+#ifndef CONFIG_PPC_64K_PAGES
+ rlwimi r15,r16,32-9,20,20
+#endif
+ /* Now we build the MAS:
+ *
+ * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG
+ * MAS 1 : Almost fully setup
+ * - PID already updated by caller if necessary
+ * - TSIZE for now is base ind page size always
+ * MAS 2 : Use defaults
+ * MAS 3+7 : Needs to be done
+ */
+#ifdef CONFIG_PPC_64K_PAGES
+ ori r10,r15,(BOOK3E_PAGESZ_64K << MAS3_SPSIZE_SHIFT)
+#else
+ ori r10,r15,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT)
+#endif
+
+BEGIN_MMU_FTR_SECTION
+ srdi r16,r10,32
+ mtspr SPRN_MAS3,r10
+ mtspr SPRN_MAS7,r16
+MMU_FTR_SECTION_ELSE
+ mtspr SPRN_MAS7_MAS3,r10
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS)
+
+ tlbwe
+
+htw_tlb_miss_done:
+ /* We don't bother with restoring DEAR or ESR since we know we are
+ * level 0 and just going back to userland. They are only needed
+ * if you are going to take an access fault
+ */
+ TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK)
+ TLB_MISS_EPILOG_SUCCESS
+ rfi
+
+htw_tlb_miss_fault:
+ /* We need to check if it was an instruction miss. We know this
+ * though because r14 would contain -1
+ */
+ cmpdi cr0,r14,-1
+ beq 1f
+ mtspr SPRN_DEAR,r16
+ mtspr SPRN_ESR,r14
+ TLB_MISS_STATS_D(MMSTAT_TLB_MISS_PT_FAULT)
+ TLB_MISS_EPILOG_ERROR
+ b exc_data_storage_book3e
+1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_PT_FAULT)
+ TLB_MISS_EPILOG_ERROR
+ b exc_instruction_storage_book3e
+
+/*
+ * This is the guts of "any" level TLB miss handler for kernel linear
+ * mapping misses. We are entered with:
+ *
+ *
+ * r16 = faulting address
+ * r15 = crap (free to use)
+ * r14 = ESR (data) or -1 (instruction)
+ * r13 = PACA
+ * r12 = TLB exception frame in PACA
+ * r11 = crap (free to use)
+ * r10 = crap (free to use)
+ *
+ * In addition we know that we will not re-enter, so in theory, we could
+ * use a simpler epilog not restoring SRR0/1 etc.. but we'll do that later.
+ *
+ * We also need to be careful about MAS registers here & TLB reservation,
+ * as we know we'll have clobbered them if we interrupt the main TLB miss
+ * handlers in which case we probably want to do a full restart at level
+ * 0 rather than saving / restoring the MAS.
+ *
+ * Note: If we care about performance of that core, we can easily shuffle
+ * a few things around
+ */
+tlb_load_linear:
+ /* For now, we assume the linear mapping is contiguous and stops at
+ * linear_map_top. We also assume the size is a multiple of 1G, thus
+ * we only use 1G pages for now. That might have to be changed in a
+ * final implementation, especially when dealing with hypervisors
+ */
+ ld r11,PACATOC(r13)
+ ld r11,linear_map_top@got(r11)
+ ld r10,0(r11)
+ tovirt(10,10)
+ cmpld cr0,r16,r10
+ bge tlb_load_linear_fault
+
+ /* MAS1 need whole new setup. */
+ li r15,(BOOK3E_PAGESZ_1GB<<MAS1_TSIZE_SHIFT)
+ oris r15,r15,MAS1_VALID@h /* MAS1 needs V and TSIZE */
+ mtspr SPRN_MAS1,r15
+
+ /* Already somebody there ? */
+ PPC_TLBSRX_DOT(0,R16)
+ beq tlb_load_linear_done
+
+ /* Now we build the remaining MAS. MAS0 and 2 should be fine
+ * with their defaults, which leaves us with MAS 3 and 7. The
+ * mapping is linear, so we just take the address, clear the
+ * region bits, and or in the permission bits which are currently
+ * hard wired
+ */
+ clrrdi r10,r16,30 /* 1G page index */
+ clrldi r10,r10,4 /* clear region bits */
+ ori r10,r10,MAS3_SR|MAS3_SW|MAS3_SX
+
+BEGIN_MMU_FTR_SECTION
+ srdi r16,r10,32
+ mtspr SPRN_MAS3,r10
+ mtspr SPRN_MAS7,r16
+MMU_FTR_SECTION_ELSE
+ mtspr SPRN_MAS7_MAS3,r10
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS)
+
+ tlbwe
+
+tlb_load_linear_done:
+ /* We use the "error" epilog for success as we do want to
+ * restore to the initial faulting context, whatever it was.
+ * We do that because we can't resume a fault within a TLB
+ * miss handler, due to MAS and TLB reservation being clobbered.
+ */
+ TLB_MISS_STATS_X(MMSTAT_TLB_MISS_LINEAR)
+ TLB_MISS_EPILOG_ERROR
+ rfi
+
+tlb_load_linear_fault:
+ /* We keep the DEAR and ESR around, this shouldn't have happened */
+ cmpdi cr0,r14,-1
+ beq 1f
+ TLB_MISS_EPILOG_ERROR_SPECIAL
+ b exc_data_storage_book3e
+1: TLB_MISS_EPILOG_ERROR_SPECIAL
+ b exc_instruction_storage_book3e
+
+
+#ifdef CONFIG_BOOK3E_MMU_TLB_STATS
+.tlb_stat_inc:
+1: ldarx r8,0,r9
+ addi r8,r8,1
+ stdcx. r8,0,r9
+ bne- 1b
+ blr
+#endif
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
new file mode 100644
index 00000000000..92cb18d52ea
--- /dev/null
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -0,0 +1,732 @@
+/*
+ * This file contains the routines for TLB flushing.
+ * On machines where the MMU does not use a hash table to store virtual to
+ * physical translations (ie, SW loaded TLBs or Book3E compilant processors,
+ * this does -not- include 603 however which shares the implementation with
+ * hash based processors)
+ *
+ * -- BenH
+ *
+ * Copyright 2008,2009 Ben Herrenschmidt <benh@kernel.crashing.org>
+ * IBM Corp.
+ *
+ * Derived from arch/ppc/mm/init.c:
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ * and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ * Copyright (C) 1996 Paul Mackerras
+ *
+ * Derived from "arch/i386/mm/init.c"
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/preempt.h>
+#include <linux/spinlock.h>
+#include <linux/memblock.h>
+#include <linux/of_fdt.h>
+#include <linux/hugetlb.h>
+
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <asm/code-patching.h>
+#include <asm/hugetlb.h>
+#include <asm/paca.h>
+
+#include "mmu_decl.h"
+
+/*
+ * This struct lists the sw-supported page sizes. The hardawre MMU may support
+ * other sizes not listed here. The .ind field is only used on MMUs that have
+ * indirect page table entries.
+ */
+#ifdef CONFIG_PPC_BOOK3E_MMU
+#ifdef CONFIG_PPC_FSL_BOOK3E
+struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
+ [MMU_PAGE_4K] = {
+ .shift = 12,
+ .enc = BOOK3E_PAGESZ_4K,
+ },
+ [MMU_PAGE_2M] = {
+ .shift = 21,
+ .enc = BOOK3E_PAGESZ_2M,
+ },
+ [MMU_PAGE_4M] = {
+ .shift = 22,
+ .enc = BOOK3E_PAGESZ_4M,
+ },
+ [MMU_PAGE_16M] = {
+ .shift = 24,
+ .enc = BOOK3E_PAGESZ_16M,
+ },
+ [MMU_PAGE_64M] = {
+ .shift = 26,
+ .enc = BOOK3E_PAGESZ_64M,
+ },
+ [MMU_PAGE_256M] = {
+ .shift = 28,
+ .enc = BOOK3E_PAGESZ_256M,
+ },
+ [MMU_PAGE_1G] = {
+ .shift = 30,
+ .enc = BOOK3E_PAGESZ_1GB,
+ },
+};
+#else
+struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
+ [MMU_PAGE_4K] = {
+ .shift = 12,
+ .ind = 20,
+ .enc = BOOK3E_PAGESZ_4K,
+ },
+ [MMU_PAGE_16K] = {
+ .shift = 14,
+ .enc = BOOK3E_PAGESZ_16K,
+ },
+ [MMU_PAGE_64K] = {
+ .shift = 16,
+ .ind = 28,
+ .enc = BOOK3E_PAGESZ_64K,
+ },
+ [MMU_PAGE_1M] = {
+ .shift = 20,
+ .enc = BOOK3E_PAGESZ_1M,
+ },
+ [MMU_PAGE_16M] = {
+ .shift = 24,
+ .ind = 36,
+ .enc = BOOK3E_PAGESZ_16M,
+ },
+ [MMU_PAGE_256M] = {
+ .shift = 28,
+ .enc = BOOK3E_PAGESZ_256M,
+ },
+ [MMU_PAGE_1G] = {
+ .shift = 30,
+ .enc = BOOK3E_PAGESZ_1GB,
+ },
+};
+#endif /* CONFIG_FSL_BOOKE */
+
+static inline int mmu_get_tsize(int psize)
+{
+ return mmu_psize_defs[psize].enc;
+}
+#else
+static inline int mmu_get_tsize(int psize)
+{
+ /* This isn't used on !Book3E for now */
+ return 0;
+}
+#endif /* CONFIG_PPC_BOOK3E_MMU */
+
+/* The variables below are currently only used on 64-bit Book3E
+ * though this will probably be made common with other nohash
+ * implementations at some point
+ */
+#ifdef CONFIG_PPC64
+
+int mmu_linear_psize; /* Page size used for the linear mapping */
+int mmu_pte_psize; /* Page size used for PTE pages */
+int mmu_vmemmap_psize; /* Page size used for the virtual mem map */
+int book3e_htw_mode; /* HW tablewalk? Value is PPC_HTW_* */
+unsigned long linear_map_top; /* Top of linear mapping */
+
+
+/*
+ * Number of bytes to add to SPRN_SPRG_TLB_EXFRAME on crit/mcheck/debug
+ * exceptions. This is used for bolted and e6500 TLB miss handlers which
+ * do not modify this SPRG in the TLB miss code; for other TLB miss handlers,
+ * this is set to zero.
+ */
+int extlb_level_exc;
+
+#endif /* CONFIG_PPC64 */
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+/* next_tlbcam_idx is used to round-robin tlbcam entry assignment */
+DEFINE_PER_CPU(int, next_tlbcam_idx);
+EXPORT_PER_CPU_SYMBOL(next_tlbcam_idx);
+#endif
+
+/*
+ * Base TLB flushing operations:
+ *
+ * - flush_tlb_mm(mm) flushes the specified mm context TLB's
+ * - flush_tlb_page(vma, vmaddr) flushes one page
+ * - flush_tlb_range(vma, start, end) flushes a range of pages
+ * - flush_tlb_kernel_range(start, end) flushes kernel pages
+ *
+ * - local_* variants of page and mm only apply to the current
+ * processor
+ */
+
+/*
+ * These are the base non-SMP variants of page and mm flushing
+ */
+void local_flush_tlb_mm(struct mm_struct *mm)
+{
+ unsigned int pid;
+
+ preempt_disable();
+ pid = mm->context.id;
+ if (pid != MMU_NO_CONTEXT)
+ _tlbil_pid(pid);
+ preempt_enable();
+}
+EXPORT_SYMBOL(local_flush_tlb_mm);
+
+void __local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
+ int tsize, int ind)
+{
+ unsigned int pid;
+
+ preempt_disable();
+ pid = mm ? mm->context.id : 0;
+ if (pid != MMU_NO_CONTEXT)
+ _tlbil_va(vmaddr, pid, tsize, ind);
+ preempt_enable();
+}
+
+void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+ __local_flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr,
+ mmu_get_tsize(mmu_virtual_psize), 0);
+}
+EXPORT_SYMBOL(local_flush_tlb_page);
+
+/*
+ * And here are the SMP non-local implementations
+ */
+#ifdef CONFIG_SMP
+
+static DEFINE_RAW_SPINLOCK(tlbivax_lock);
+
+static int mm_is_core_local(struct mm_struct *mm)
+{
+ return cpumask_subset(mm_cpumask(mm),
+ topology_thread_cpumask(smp_processor_id()));
+}
+
+struct tlb_flush_param {
+ unsigned long addr;
+ unsigned int pid;
+ unsigned int tsize;
+ unsigned int ind;
+};
+
+static void do_flush_tlb_mm_ipi(void *param)
+{
+ struct tlb_flush_param *p = param;
+
+ _tlbil_pid(p ? p->pid : 0);
+}
+
+static void do_flush_tlb_page_ipi(void *param)
+{
+ struct tlb_flush_param *p = param;
+
+ _tlbil_va(p->addr, p->pid, p->tsize, p->ind);
+}
+
+
+/* Note on invalidations and PID:
+ *
+ * We snapshot the PID with preempt disabled. At this point, it can still
+ * change either because:
+ * - our context is being stolen (PID -> NO_CONTEXT) on another CPU
+ * - we are invaliating some target that isn't currently running here
+ * and is concurrently acquiring a new PID on another CPU
+ * - some other CPU is re-acquiring a lost PID for this mm
+ * etc...
+ *
+ * However, this shouldn't be a problem as we only guarantee
+ * invalidation of TLB entries present prior to this call, so we
+ * don't care about the PID changing, and invalidating a stale PID
+ * is generally harmless.
+ */
+
+void flush_tlb_mm(struct mm_struct *mm)
+{
+ unsigned int pid;
+
+ preempt_disable();
+ pid = mm->context.id;
+ if (unlikely(pid == MMU_NO_CONTEXT))
+ goto no_context;
+ if (!mm_is_core_local(mm)) {
+ struct tlb_flush_param p = { .pid = pid };
+ /* Ignores smp_processor_id() even if set. */
+ smp_call_function_many(mm_cpumask(mm),
+ do_flush_tlb_mm_ipi, &p, 1);
+ }
+ _tlbil_pid(pid);
+ no_context:
+ preempt_enable();
+}
+EXPORT_SYMBOL(flush_tlb_mm);
+
+void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
+ int tsize, int ind)
+{
+ struct cpumask *cpu_mask;
+ unsigned int pid;
+
+ preempt_disable();
+ pid = mm ? mm->context.id : 0;
+ if (unlikely(pid == MMU_NO_CONTEXT))
+ goto bail;
+ cpu_mask = mm_cpumask(mm);
+ if (!mm_is_core_local(mm)) {
+ /* If broadcast tlbivax is supported, use it */
+ if (mmu_has_feature(MMU_FTR_USE_TLBIVAX_BCAST)) {
+ int lock = mmu_has_feature(MMU_FTR_LOCK_BCAST_INVAL);
+ if (lock)
+ raw_spin_lock(&tlbivax_lock);
+ _tlbivax_bcast(vmaddr, pid, tsize, ind);
+ if (lock)
+ raw_spin_unlock(&tlbivax_lock);
+ goto bail;
+ } else {
+ struct tlb_flush_param p = {
+ .pid = pid,
+ .addr = vmaddr,
+ .tsize = tsize,
+ .ind = ind,
+ };
+ /* Ignores smp_processor_id() even if set in cpu_mask */
+ smp_call_function_many(cpu_mask,
+ do_flush_tlb_page_ipi, &p, 1);
+ }
+ }
+ _tlbil_va(vmaddr, pid, tsize, ind);
+ bail:
+ preempt_enable();
+}
+
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+ if (vma && is_vm_hugetlb_page(vma))
+ flush_hugetlb_page(vma, vmaddr);
+#endif
+
+ __flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr,
+ mmu_get_tsize(mmu_virtual_psize), 0);
+}
+EXPORT_SYMBOL(flush_tlb_page);
+
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_PPC_47x
+void __init early_init_mmu_47x(void)
+{
+#ifdef CONFIG_SMP
+ unsigned long root = of_get_flat_dt_root();
+ if (of_get_flat_dt_prop(root, "cooperative-partition", NULL))
+ mmu_clear_feature(MMU_FTR_USE_TLBIVAX_BCAST);
+#endif /* CONFIG_SMP */
+}
+#endif /* CONFIG_PPC_47x */
+
+/*
+ * Flush kernel TLB entries in the given range
+ */
+void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+#ifdef CONFIG_SMP
+ preempt_disable();
+ smp_call_function(do_flush_tlb_mm_ipi, NULL, 1);
+ _tlbil_pid(0);
+ preempt_enable();
+#else
+ _tlbil_pid(0);
+#endif
+}
+EXPORT_SYMBOL(flush_tlb_kernel_range);
+
+/*
+ * Currently, for range flushing, we just do a full mm flush. This should
+ * be optimized based on a threshold on the size of the range, since
+ * some implementation can stack multiple tlbivax before a tlbsync but
+ * for now, we keep it that way
+ */
+void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end)
+
+{
+ flush_tlb_mm(vma->vm_mm);
+}
+EXPORT_SYMBOL(flush_tlb_range);
+
+void tlb_flush(struct mmu_gather *tlb)
+{
+ flush_tlb_mm(tlb->mm);
+}
+
+/*
+ * Below are functions specific to the 64-bit variant of Book3E though that
+ * may change in the future
+ */
+
+#ifdef CONFIG_PPC64
+
+/*
+ * Handling of virtual linear page tables or indirect TLB entries
+ * flushing when PTE pages are freed
+ */
+void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address)
+{
+ int tsize = mmu_psize_defs[mmu_pte_psize].enc;
+
+ if (book3e_htw_mode != PPC_HTW_NONE) {
+ unsigned long start = address & PMD_MASK;
+ unsigned long end = address + PMD_SIZE;
+ unsigned long size = 1UL << mmu_psize_defs[mmu_pte_psize].shift;
+
+ /* This isn't the most optimal, ideally we would factor out the
+ * while preempt & CPU mask mucking around, or even the IPI but
+ * it will do for now
+ */
+ while (start < end) {
+ __flush_tlb_page(tlb->mm, start, tsize, 1);
+ start += size;
+ }
+ } else {
+ unsigned long rmask = 0xf000000000000000ul;
+ unsigned long rid = (address & rmask) | 0x1000000000000000ul;
+ unsigned long vpte = address & ~rmask;
+
+#ifdef CONFIG_PPC_64K_PAGES
+ vpte = (vpte >> (PAGE_SHIFT - 4)) & ~0xfffful;
+#else
+ vpte = (vpte >> (PAGE_SHIFT - 3)) & ~0xffful;
+#endif
+ vpte |= rid;
+ __flush_tlb_page(tlb->mm, vpte, tsize, 0);
+ }
+}
+
+static void setup_page_sizes(void)
+{
+ unsigned int tlb0cfg;
+ unsigned int tlb0ps;
+ unsigned int eptcfg;
+ int i, psize;
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+ unsigned int mmucfg = mfspr(SPRN_MMUCFG);
+ int fsl_mmu = mmu_has_feature(MMU_FTR_TYPE_FSL_E);
+
+ if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V1) {
+ unsigned int tlb1cfg = mfspr(SPRN_TLB1CFG);
+ unsigned int min_pg, max_pg;
+
+ min_pg = (tlb1cfg & TLBnCFG_MINSIZE) >> TLBnCFG_MINSIZE_SHIFT;
+ max_pg = (tlb1cfg & TLBnCFG_MAXSIZE) >> TLBnCFG_MAXSIZE_SHIFT;
+
+ for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+ struct mmu_psize_def *def;
+ unsigned int shift;
+
+ def = &mmu_psize_defs[psize];
+ shift = def->shift;
+
+ if (shift == 0 || shift & 1)
+ continue;
+
+ /* adjust to be in terms of 4^shift Kb */
+ shift = (shift - 10) >> 1;
+
+ if ((shift >= min_pg) && (shift <= max_pg))
+ def->flags |= MMU_PAGE_SIZE_DIRECT;
+ }
+
+ goto out;
+ }
+
+ if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2) {
+ u32 tlb1cfg, tlb1ps;
+
+ tlb0cfg = mfspr(SPRN_TLB0CFG);
+ tlb1cfg = mfspr(SPRN_TLB1CFG);
+ tlb1ps = mfspr(SPRN_TLB1PS);
+ eptcfg = mfspr(SPRN_EPTCFG);
+
+ if ((tlb1cfg & TLBnCFG_IND) && (tlb0cfg & TLBnCFG_PT))
+ book3e_htw_mode = PPC_HTW_E6500;
+
+ /*
+ * We expect 4K subpage size and unrestricted indirect size.
+ * The lack of a restriction on indirect size is a Freescale
+ * extension, indicated by PSn = 0 but SPSn != 0.
+ */
+ if (eptcfg != 2)
+ book3e_htw_mode = PPC_HTW_NONE;
+
+ for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+ struct mmu_psize_def *def = &mmu_psize_defs[psize];
+
+ if (tlb1ps & (1U << (def->shift - 10))) {
+ def->flags |= MMU_PAGE_SIZE_DIRECT;
+
+ if (book3e_htw_mode && psize == MMU_PAGE_2M)
+ def->flags |= MMU_PAGE_SIZE_INDIRECT;
+ }
+ }
+
+ goto out;
+ }
+#endif
+
+ tlb0cfg = mfspr(SPRN_TLB0CFG);
+ tlb0ps = mfspr(SPRN_TLB0PS);
+ eptcfg = mfspr(SPRN_EPTCFG);
+
+ /* Look for supported direct sizes */
+ for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+ struct mmu_psize_def *def = &mmu_psize_defs[psize];
+
+ if (tlb0ps & (1U << (def->shift - 10)))
+ def->flags |= MMU_PAGE_SIZE_DIRECT;
+ }
+
+ /* Indirect page sizes supported ? */
+ if ((tlb0cfg & TLBnCFG_IND) == 0 ||
+ (tlb0cfg & TLBnCFG_PT) == 0)
+ goto out;
+
+ book3e_htw_mode = PPC_HTW_IBM;
+
+ /* Now, we only deal with one IND page size for each
+ * direct size. Hopefully all implementations today are
+ * unambiguous, but we might want to be careful in the
+ * future.
+ */
+ for (i = 0; i < 3; i++) {
+ unsigned int ps, sps;
+
+ sps = eptcfg & 0x1f;
+ eptcfg >>= 5;
+ ps = eptcfg & 0x1f;
+ eptcfg >>= 5;
+ if (!ps || !sps)
+ continue;
+ for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
+ struct mmu_psize_def *def = &mmu_psize_defs[psize];
+
+ if (ps == (def->shift - 10))
+ def->flags |= MMU_PAGE_SIZE_INDIRECT;
+ if (sps == (def->shift - 10))
+ def->ind = ps + 10;
+ }
+ }
+
+out:
+ /* Cleanup array and print summary */
+ pr_info("MMU: Supported page sizes\n");
+ for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+ struct mmu_psize_def *def = &mmu_psize_defs[psize];
+ const char *__page_type_names[] = {
+ "unsupported",
+ "direct",
+ "indirect",
+ "direct & indirect"
+ };
+ if (def->flags == 0) {
+ def->shift = 0;
+ continue;
+ }
+ pr_info(" %8ld KB as %s\n", 1ul << (def->shift - 10),
+ __page_type_names[def->flags & 0x3]);
+ }
+}
+
+static void setup_mmu_htw(void)
+{
+ /*
+ * If we want to use HW tablewalk, enable it by patching the TLB miss
+ * handlers to branch to the one dedicated to it.
+ */
+
+ switch (book3e_htw_mode) {
+ case PPC_HTW_IBM:
+ patch_exception(0x1c0, exc_data_tlb_miss_htw_book3e);
+ patch_exception(0x1e0, exc_instruction_tlb_miss_htw_book3e);
+ break;
+#ifdef CONFIG_PPC_FSL_BOOK3E
+ case PPC_HTW_E6500:
+ extlb_level_exc = EX_TLB_SIZE;
+ patch_exception(0x1c0, exc_data_tlb_miss_e6500_book3e);
+ patch_exception(0x1e0, exc_instruction_tlb_miss_e6500_book3e);
+ break;
+#endif
+ }
+ pr_info("MMU: Book3E HW tablewalk %s\n",
+ book3e_htw_mode != PPC_HTW_NONE ? "enabled" : "not supported");
+}
+
+/*
+ * Early initialization of the MMU TLB code
+ */
+static void __early_init_mmu(int boot_cpu)
+{
+ unsigned int mas4;
+
+ /* XXX This will have to be decided at runtime, but right
+ * now our boot and TLB miss code hard wires it. Ideally
+ * we should find out a suitable page size and patch the
+ * TLB miss code (either that or use the PACA to store
+ * the value we want)
+ */
+ mmu_linear_psize = MMU_PAGE_1G;
+
+ /* XXX This should be decided at runtime based on supported
+ * page sizes in the TLB, but for now let's assume 16M is
+ * always there and a good fit (which it probably is)
+ *
+ * Freescale booke only supports 4K pages in TLB0, so use that.
+ */
+ if (mmu_has_feature(MMU_FTR_TYPE_FSL_E))
+ mmu_vmemmap_psize = MMU_PAGE_4K;
+ else
+ mmu_vmemmap_psize = MMU_PAGE_16M;
+
+ /* XXX This code only checks for TLB 0 capabilities and doesn't
+ * check what page size combos are supported by the HW. It
+ * also doesn't handle the case where a separate array holds
+ * the IND entries from the array loaded by the PT.
+ */
+ if (boot_cpu) {
+ /* Look for supported page sizes */
+ setup_page_sizes();
+
+ /* Look for HW tablewalk support */
+ setup_mmu_htw();
+ }
+
+ /* Set MAS4 based on page table setting */
+
+ mas4 = 0x4 << MAS4_WIMGED_SHIFT;
+ switch (book3e_htw_mode) {
+ case PPC_HTW_E6500:
+ mas4 |= MAS4_INDD;
+ mas4 |= BOOK3E_PAGESZ_2M << MAS4_TSIZED_SHIFT;
+ mas4 |= MAS4_TLBSELD(1);
+ mmu_pte_psize = MMU_PAGE_2M;
+ break;
+
+ case PPC_HTW_IBM:
+ mas4 |= MAS4_INDD;
+#ifdef CONFIG_PPC_64K_PAGES
+ mas4 |= BOOK3E_PAGESZ_256M << MAS4_TSIZED_SHIFT;
+ mmu_pte_psize = MMU_PAGE_256M;
+#else
+ mas4 |= BOOK3E_PAGESZ_1M << MAS4_TSIZED_SHIFT;
+ mmu_pte_psize = MMU_PAGE_1M;
+#endif
+ break;
+
+ case PPC_HTW_NONE:
+#ifdef CONFIG_PPC_64K_PAGES
+ mas4 |= BOOK3E_PAGESZ_64K << MAS4_TSIZED_SHIFT;
+#else
+ mas4 |= BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT;
+#endif
+ mmu_pte_psize = mmu_virtual_psize;
+ break;
+ }
+ mtspr(SPRN_MAS4, mas4);
+
+ /* Set the global containing the top of the linear mapping
+ * for use by the TLB miss code
+ */
+ linear_map_top = memblock_end_of_DRAM();
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+ if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
+ unsigned int num_cams;
+
+ /* use a quarter of the TLBCAM for bolted linear map */
+ num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4;
+ linear_map_top = map_mem_in_cams(linear_map_top, num_cams);
+
+ /* limit memory so we dont have linear faults */
+ memblock_enforce_memory_limit(linear_map_top);
+
+ if (book3e_htw_mode == PPC_HTW_NONE) {
+ extlb_level_exc = EX_TLB_SIZE;
+ patch_exception(0x1c0, exc_data_tlb_miss_bolted_book3e);
+ patch_exception(0x1e0,
+ exc_instruction_tlb_miss_bolted_book3e);
+ }
+ }
+#endif
+
+ /* A sync won't hurt us after mucking around with
+ * the MMU configuration
+ */
+ mb();
+
+ memblock_set_current_limit(linear_map_top);
+}
+
+void __init early_init_mmu(void)
+{
+ __early_init_mmu(1);
+}
+
+void early_init_mmu_secondary(void)
+{
+ __early_init_mmu(0);
+}
+
+void setup_initial_memory_limit(phys_addr_t first_memblock_base,
+ phys_addr_t first_memblock_size)
+{
+ /* On non-FSL Embedded 64-bit, we adjust the RMA size to match
+ * the bolted TLB entry. We know for now that only 1G
+ * entries are supported though that may eventually
+ * change.
+ *
+ * on FSL Embedded 64-bit, we adjust the RMA size to match the
+ * first bolted TLB entry size. We still limit max to 1G even if
+ * the TLB could cover more. This is due to what the early init
+ * code is setup to do.
+ *
+ * We crop it to the size of the first MEMBLOCK to
+ * avoid going over total available memory just in case...
+ */
+#ifdef CONFIG_PPC_FSL_BOOK3E
+ if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
+ unsigned long linear_sz;
+ linear_sz = calc_cam_sz(first_memblock_size, PAGE_OFFSET,
+ first_memblock_base);
+ ppc64_rma_size = min_t(u64, linear_sz, 0x40000000);
+ } else
+#endif
+ ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000);
+
+ /* Finally limit subsequent allocations */
+ memblock_set_current_limit(first_memblock_base + ppc64_rma_size);
+}
+#else /* ! CONFIG_PPC64 */
+void __init early_init_mmu(void)
+{
+#ifdef CONFIG_PPC_47x
+ early_init_mmu_47x();
+#endif
+}
+#endif /* CONFIG_PPC64 */
diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S
new file mode 100644
index 00000000000..43ff3c797fb
--- /dev/null
+++ b/arch/powerpc/mm/tlb_nohash_low.S
@@ -0,0 +1,426 @@
+/*
+ * This file contains low-level functions for performing various
+ * types of TLB invalidations on various processors with no hash
+ * table.
+ *
+ * This file implements the following functions for all no-hash
+ * processors. Some aren't implemented for some variants. Some
+ * are inline in tlbflush.h
+ *
+ * - tlbil_va
+ * - tlbil_pid
+ * - tlbil_all
+ * - tlbivax_bcast
+ *
+ * Code mostly moved over from misc_32.S
+ *
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Partially rewritten by Cort Dougan (cort@cs.nmt.edu)
+ * Paul Mackerras, Kumar Gala and Benjamin Herrenschmidt.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/cputable.h>
+#include <asm/mmu.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/processor.h>
+#include <asm/bug.h>
+
+#if defined(CONFIG_40x)
+
+/*
+ * 40x implementation needs only tlbil_va
+ */
+_GLOBAL(__tlbil_va)
+ /* We run the search with interrupts disabled because we have to change
+ * the PID and I don't want to preempt when that happens.
+ */
+ mfmsr r5
+ mfspr r6,SPRN_PID
+ wrteei 0
+ mtspr SPRN_PID,r4
+ tlbsx. r3, 0, r3
+ mtspr SPRN_PID,r6
+ wrtee r5
+ bne 1f
+ sync
+ /* There are only 64 TLB entries, so r3 < 64, which means bit 25 is
+ * clear. Since 25 is the V bit in the TLB_TAG, loading this value
+ * will invalidate the TLB entry. */
+ tlbwe r3, r3, TLB_TAG
+ isync
+1: blr
+
+#elif defined(CONFIG_8xx)
+
+/*
+ * Nothing to do for 8xx, everything is inline
+ */
+
+#elif defined(CONFIG_44x) /* Includes 47x */
+
+/*
+ * 440 implementation uses tlbsx/we for tlbil_va and a full sweep
+ * of the TLB for everything else.
+ */
+_GLOBAL(__tlbil_va)
+ mfspr r5,SPRN_MMUCR
+ mfmsr r10
+
+ /*
+ * We write 16 bits of STID since 47x supports that much, we
+ * will never be passed out of bounds values on 440 (hopefully)
+ */
+ rlwimi r5,r4,0,16,31
+
+ /* We have to run the search with interrupts disabled, otherwise
+ * an interrupt which causes a TLB miss can clobber the MMUCR
+ * between the mtspr and the tlbsx.
+ *
+ * Critical and Machine Check interrupts take care of saving
+ * and restoring MMUCR, so only normal interrupts have to be
+ * taken care of.
+ */
+ wrteei 0
+ mtspr SPRN_MMUCR,r5
+ tlbsx. r6,0,r3
+ bne 10f
+ sync
+BEGIN_MMU_FTR_SECTION
+ b 2f
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x)
+ /* On 440 There are only 64 TLB entries, so r3 < 64, which means bit
+ * 22, is clear. Since 22 is the V bit in the TLB_PAGEID, loading this
+ * value will invalidate the TLB entry.
+ */
+ tlbwe r6,r6,PPC44x_TLB_PAGEID
+ isync
+10: wrtee r10
+ blr
+2:
+#ifdef CONFIG_PPC_47x
+ oris r7,r6,0x8000 /* specify way explicitely */
+ clrrwi r4,r3,12 /* get an EPN for the hashing with V = 0 */
+ ori r4,r4,PPC47x_TLBE_SIZE
+ tlbwe r4,r7,0 /* write it */
+ isync
+ wrtee r10
+ blr
+#else /* CONFIG_PPC_47x */
+1: trap
+ EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0;
+#endif /* !CONFIG_PPC_47x */
+
+_GLOBAL(_tlbil_all)
+_GLOBAL(_tlbil_pid)
+BEGIN_MMU_FTR_SECTION
+ b 2f
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x)
+ li r3,0
+ sync
+
+ /* Load high watermark */
+ lis r4,tlb_44x_hwater@ha
+ lwz r5,tlb_44x_hwater@l(r4)
+
+1: tlbwe r3,r3,PPC44x_TLB_PAGEID
+ addi r3,r3,1
+ cmpw 0,r3,r5
+ ble 1b
+
+ isync
+ blr
+2:
+#ifdef CONFIG_PPC_47x
+ /* 476 variant. There's not simple way to do this, hopefully we'll
+ * try to limit the amount of such full invalidates
+ */
+ mfmsr r11 /* Interrupts off */
+ wrteei 0
+ li r3,-1 /* Current set */
+ lis r10,tlb_47x_boltmap@h
+ ori r10,r10,tlb_47x_boltmap@l
+ lis r7,0x8000 /* Specify way explicitely */
+
+ b 9f /* For each set */
+
+1: li r9,4 /* Number of ways */
+ li r4,0 /* Current way */
+ li r6,0 /* Default entry value 0 */
+ andi. r0,r8,1 /* Check if way 0 is bolted */
+ mtctr r9 /* Load way counter */
+ bne- 3f /* Bolted, skip loading it */
+
+2: /* For each way */
+ or r5,r3,r4 /* Make way|index for tlbre */
+ rlwimi r5,r5,16,8,15 /* Copy index into position */
+ tlbre r6,r5,0 /* Read entry */
+3: addis r4,r4,0x2000 /* Next way */
+ andi. r0,r6,PPC47x_TLB0_VALID /* Valid entry ? */
+ beq 4f /* Nope, skip it */
+ rlwimi r7,r5,0,1,2 /* Insert way number */
+ rlwinm r6,r6,0,21,19 /* Clear V */
+ tlbwe r6,r7,0 /* Write it */
+4: bdnz 2b /* Loop for each way */
+ srwi r8,r8,1 /* Next boltmap bit */
+9: cmpwi cr1,r3,255 /* Last set done ? */
+ addi r3,r3,1 /* Next set */
+ beq cr1,1f /* End of loop */
+ andi. r0,r3,0x1f /* Need to load a new boltmap word ? */
+ bne 1b /* No, loop */
+ lwz r8,0(r10) /* Load boltmap entry */
+ addi r10,r10,4 /* Next word */
+ b 1b /* Then loop */
+1: isync /* Sync shadows */
+ wrtee r11
+#else /* CONFIG_PPC_47x */
+1: trap
+ EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0;
+#endif /* !CONFIG_PPC_47x */
+ blr
+
+#ifdef CONFIG_PPC_47x
+
+/*
+ * _tlbivax_bcast is only on 47x. We don't bother doing a runtime
+ * check though, it will blow up soon enough if we mistakenly try
+ * to use it on a 440.
+ */
+_GLOBAL(_tlbivax_bcast)
+ mfspr r5,SPRN_MMUCR
+ mfmsr r10
+ rlwimi r5,r4,0,16,31
+ wrteei 0
+ mtspr SPRN_MMUCR,r5
+ isync
+ PPC_TLBIVAX(0, R3)
+ isync
+ eieio
+ tlbsync
+BEGIN_FTR_SECTION
+ b 1f
+END_FTR_SECTION_IFSET(CPU_FTR_476_DD2)
+ sync
+ wrtee r10
+ blr
+/*
+ * DD2 HW could hang if in instruction fetch happens before msync completes.
+ * Touch enough instruction cache lines to ensure cache hits
+ */
+1: mflr r9
+ bl 2f
+2: mflr r6
+ li r7,32
+ PPC_ICBT(0,R6,R7) /* touch next cache line */
+ add r6,r6,r7
+ PPC_ICBT(0,R6,R7) /* touch next cache line */
+ add r6,r6,r7
+ PPC_ICBT(0,R6,R7) /* touch next cache line */
+ sync
+ nop
+ nop
+ nop
+ nop
+ nop
+ nop
+ nop
+ nop
+ mtlr r9
+ wrtee r10
+ blr
+#endif /* CONFIG_PPC_47x */
+
+#elif defined(CONFIG_FSL_BOOKE)
+/*
+ * FSL BookE implementations.
+ *
+ * Since feature sections are using _SECTION_ELSE we need
+ * to have the larger code path before the _SECTION_ELSE
+ */
+
+/*
+ * Flush MMU TLB on the local processor
+ */
+_GLOBAL(_tlbil_all)
+BEGIN_MMU_FTR_SECTION
+ li r3,(MMUCSR0_TLBFI)@l
+ mtspr SPRN_MMUCSR0, r3
+1:
+ mfspr r3,SPRN_MMUCSR0
+ andi. r3,r3,MMUCSR0_TLBFI@l
+ bne 1b
+MMU_FTR_SECTION_ELSE
+ PPC_TLBILX_ALL(0,R0)
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX)
+ msync
+ isync
+ blr
+
+_GLOBAL(_tlbil_pid)
+BEGIN_MMU_FTR_SECTION
+ slwi r3,r3,16
+ mfmsr r10
+ wrteei 0
+ mfspr r4,SPRN_MAS6 /* save MAS6 */
+ mtspr SPRN_MAS6,r3
+ PPC_TLBILX_PID(0,R0)
+ mtspr SPRN_MAS6,r4 /* restore MAS6 */
+ wrtee r10
+MMU_FTR_SECTION_ELSE
+ li r3,(MMUCSR0_TLBFI)@l
+ mtspr SPRN_MMUCSR0, r3
+1:
+ mfspr r3,SPRN_MMUCSR0
+ andi. r3,r3,MMUCSR0_TLBFI@l
+ bne 1b
+ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBILX)
+ msync
+ isync
+ blr
+
+/*
+ * Flush MMU TLB for a particular address, but only on the local processor
+ * (no broadcast)
+ */
+_GLOBAL(__tlbil_va)
+ mfmsr r10
+ wrteei 0
+ slwi r4,r4,16
+ ori r4,r4,(MAS6_ISIZE(BOOK3E_PAGESZ_4K))@l
+ mtspr SPRN_MAS6,r4 /* assume AS=0 for now */
+BEGIN_MMU_FTR_SECTION
+ tlbsx 0,r3
+ mfspr r4,SPRN_MAS1 /* check valid */
+ andis. r3,r4,MAS1_VALID@h
+ beq 1f
+ rlwinm r4,r4,0,1,31
+ mtspr SPRN_MAS1,r4
+ tlbwe
+MMU_FTR_SECTION_ELSE
+ PPC_TLBILX_VA(0,R3)
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX)
+ msync
+ isync
+1: wrtee r10
+ blr
+#elif defined(CONFIG_PPC_BOOK3E)
+/*
+ * New Book3E (>= 2.06) implementation
+ *
+ * Note: We may be able to get away without the interrupt masking stuff
+ * if we save/restore MAS6 on exceptions that might modify it
+ */
+_GLOBAL(_tlbil_pid)
+ slwi r4,r3,MAS6_SPID_SHIFT
+ mfmsr r10
+ wrteei 0
+ mtspr SPRN_MAS6,r4
+ PPC_TLBILX_PID(0,R0)
+ wrtee r10
+ msync
+ isync
+ blr
+
+_GLOBAL(_tlbil_pid_noind)
+ slwi r4,r3,MAS6_SPID_SHIFT
+ mfmsr r10
+ ori r4,r4,MAS6_SIND
+ wrteei 0
+ mtspr SPRN_MAS6,r4
+ PPC_TLBILX_PID(0,R0)
+ wrtee r10
+ msync
+ isync
+ blr
+
+_GLOBAL(_tlbil_all)
+ PPC_TLBILX_ALL(0,R0)
+ msync
+ isync
+ blr
+
+_GLOBAL(_tlbil_va)
+ mfmsr r10
+ wrteei 0
+ cmpwi cr0,r6,0
+ slwi r4,r4,MAS6_SPID_SHIFT
+ rlwimi r4,r5,MAS6_ISIZE_SHIFT,MAS6_ISIZE_MASK
+ beq 1f
+ rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND
+1: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */
+ PPC_TLBILX_VA(0,R3)
+ msync
+ isync
+ wrtee r10
+ blr
+
+_GLOBAL(_tlbivax_bcast)
+ mfmsr r10
+ wrteei 0
+ cmpwi cr0,r6,0
+ slwi r4,r4,MAS6_SPID_SHIFT
+ rlwimi r4,r5,MAS6_ISIZE_SHIFT,MAS6_ISIZE_MASK
+ beq 1f
+ rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND
+1: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */
+ PPC_TLBIVAX(0,R3)
+ eieio
+ tlbsync
+ sync
+ wrtee r10
+ blr
+
+_GLOBAL(set_context)
+#ifdef CONFIG_BDI_SWITCH
+ /* Context switch the PTE pointer for the Abatron BDI2000.
+ * The PGDIR is the second parameter.
+ */
+ lis r5, abatron_pteptrs@h
+ ori r5, r5, abatron_pteptrs@l
+ stw r4, 0x4(r5)
+#endif
+ mtspr SPRN_PID,r3
+ isync /* Force context change */
+ blr
+#else
+#error Unsupported processor type !
+#endif
+
+#if defined(CONFIG_PPC_FSL_BOOK3E)
+/*
+ * extern void loadcam_entry(unsigned int index)
+ *
+ * Load TLBCAM[index] entry in to the L2 CAM MMU
+ */
+_GLOBAL(loadcam_entry)
+ mflr r5
+ LOAD_REG_ADDR_PIC(r4, TLBCAM)
+ mtlr r5
+ mulli r5,r3,TLBCAM_SIZE
+ add r3,r5,r4
+ lwz r4,TLBCAM_MAS0(r3)
+ mtspr SPRN_MAS0,r4
+ lwz r4,TLBCAM_MAS1(r3)
+ mtspr SPRN_MAS1,r4
+ PPC_LL r4,TLBCAM_MAS2(r3)
+ mtspr SPRN_MAS2,r4
+ lwz r4,TLBCAM_MAS3(r3)
+ mtspr SPRN_MAS3,r4
+BEGIN_MMU_FTR_SECTION
+ lwz r4,TLBCAM_MAS7(r3)
+ mtspr SPRN_MAS7,r4
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS)
+ isync
+ tlbwe
+ isync
+ blr
+#endif