aboutsummaryrefslogtreecommitdiff
path: root/arch/powerpc/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r--arch/powerpc/mm/44x_mmu.c7
-rw-r--r--arch/powerpc/mm/Makefile10
-rw-r--r--arch/powerpc/mm/dma-noncoherent.c9
-rw-r--r--arch/powerpc/mm/fault.c235
-rw-r--r--arch/powerpc/mm/fsl_booke_mmu.c99
-rw-r--r--arch/powerpc/mm/gup.c87
-rw-r--r--arch/powerpc/mm/hash_low_32.S8
-rw-r--r--arch/powerpc/mm/hash_low_64.S393
-rw-r--r--arch/powerpc/mm/hash_native_64.c441
-rw-r--r--arch/powerpc/mm/hash_utils_64.c539
-rw-r--r--arch/powerpc/mm/hugepage-hash64.c179
-rw-r--r--arch/powerpc/mm/hugetlbpage-book3e.c57
-rw-r--r--arch/powerpc/mm/hugetlbpage-hash64.c50
-rw-r--r--arch/powerpc/mm/hugetlbpage.c445
-rw-r--r--arch/powerpc/mm/icswx.c25
-rw-r--r--arch/powerpc/mm/icswx.h6
-rw-r--r--arch/powerpc/mm/init_32.c8
-rw-r--r--arch/powerpc/mm/init_64.c86
-rw-r--r--arch/powerpc/mm/mem.c152
-rw-r--r--arch/powerpc/mm/mmap.c (renamed from arch/powerpc/mm/mmap_64.c)2
-rw-r--r--arch/powerpc/mm/mmu_context_hash64.c46
-rw-r--r--arch/powerpc/mm/mmu_context_nohash.c38
-rw-r--r--arch/powerpc/mm/mmu_decl.h2
-rw-r--r--arch/powerpc/mm/numa.c575
-rw-r--r--arch/powerpc/mm/pgtable.c30
-rw-r--r--arch/powerpc/mm/pgtable_32.c9
-rw-r--r--arch/powerpc/mm/pgtable_64.c567
-rw-r--r--arch/powerpc/mm/slb.c29
-rw-r--r--arch/powerpc/mm/slb_low.S124
-rw-r--r--arch/powerpc/mm/slice.c339
-rw-r--r--arch/powerpc/mm/stab.c12
-rw-r--r--arch/powerpc/mm/subpage-prot.c60
-rw-r--r--arch/powerpc/mm/tlb_hash64.c56
-rw-r--r--arch/powerpc/mm/tlb_low_64e.S251
-rw-r--r--arch/powerpc/mm/tlb_nohash.c150
-rw-r--r--arch/powerpc/mm/tlb_nohash_low.S35
36 files changed, 3686 insertions, 1475 deletions
diff --git a/arch/powerpc/mm/44x_mmu.c b/arch/powerpc/mm/44x_mmu.c
index 388b95e1a00..82b1ff759e2 100644
--- a/arch/powerpc/mm/44x_mmu.c
+++ b/arch/powerpc/mm/44x_mmu.c
@@ -27,7 +27,6 @@
#include <linux/memblock.h>
#include <asm/mmu.h>
-#include <asm/system.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
@@ -42,7 +41,7 @@ int icache_44x_need_flush;
unsigned long tlb_47x_boltmap[1024/8];
-static void __cpuinit ppc44x_update_tlb_hwater(void)
+static void ppc44x_update_tlb_hwater(void)
{
extern unsigned int tlb_44x_patch_hwater_D[];
extern unsigned int tlb_44x_patch_hwater_I[];
@@ -135,7 +134,7 @@ static void __init ppc47x_update_boltmap(void)
/*
* "Pins" a 256MB TLB entry in AS0 for kernel lowmem for 47x type MMU
*/
-static void __cpuinit ppc47x_pin_tlb(unsigned int virt, unsigned int phys)
+static void ppc47x_pin_tlb(unsigned int virt, unsigned int phys)
{
unsigned int rA;
int bolted;
@@ -230,7 +229,7 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base,
}
#ifdef CONFIG_SMP
-void __cpuinit mmu_init_secondary(int cpu)
+void mmu_init_secondary(int cpu)
{
unsigned long addr;
unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1);
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 3787b61f7d2..51230ee6a40 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -4,19 +4,18 @@
subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
-ccflags-$(CONFIG_PPC64) := -mno-minimal-toc
+ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC)
-obj-y := fault.o mem.o pgtable.o gup.o \
+obj-y := fault.o mem.o pgtable.o gup.o mmap.o \
init_$(CONFIG_WORD_SIZE).o \
pgtable_$(CONFIG_WORD_SIZE).o
obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \
tlb_nohash_low.o
obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(CONFIG_WORD_SIZE)e.o
-obj-$(CONFIG_PPC64) += mmap_64.o
hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o
obj-$(CONFIG_PPC_STD_MMU_64) += hash_utils_64.o \
slb_low.o slb.o stab.o \
- mmap_64.o $(hash64-y)
+ $(hash64-y)
obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o
obj-$(CONFIG_PPC_STD_MMU) += hash_low_$(CONFIG_WORD_SIZE).o \
tlb_hash$(CONFIG_WORD_SIZE).o \
@@ -28,11 +27,12 @@ obj-$(CONFIG_44x) += 44x_mmu.o
obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_booke_mmu.o
obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
obj-$(CONFIG_PPC_MM_SLICES) += slice.o
-ifeq ($(CONFIG_HUGETLB_PAGE),y)
obj-y += hugetlbpage.o
+ifeq ($(CONFIG_HUGETLB_PAGE),y)
obj-$(CONFIG_PPC_STD_MMU_64) += hugetlbpage-hash64.o
obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o
endif
+obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o
obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o
obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
obj-$(CONFIG_HIGHMEM) += highmem.o
diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c
index 329be36c0a8..7b6c1075017 100644
--- a/arch/powerpc/mm/dma-noncoherent.c
+++ b/arch/powerpc/mm/dma-noncoherent.c
@@ -287,9 +287,7 @@ void __dma_free_coherent(size_t size, void *vaddr)
pte_clear(&init_mm, addr, ptep);
if (pfn_valid(pfn)) {
struct page *page = pfn_to_page(pfn);
-
- ClearPageReserved(page);
- __free_page(page);
+ __free_reserved_page(page);
}
}
addr += PAGE_SIZE;
@@ -365,12 +363,11 @@ static inline void __dma_sync_page_highmem(struct page *page,
local_irq_save(flags);
do {
- start = (unsigned long)kmap_atomic(page + seg_nr,
- KM_PPC_SYNC_PAGE) + seg_offset;
+ start = (unsigned long)kmap_atomic(page + seg_nr) + seg_offset;
/* Sync this buffer segment */
__dma_sync((void *)start, seg_size, direction);
- kunmap_atomic((void *)start, KM_PPC_SYNC_PAGE);
+ kunmap_atomic((void *)start);
seg_nr++;
/* Calculate next buffer segment size */
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 2f0d1b032a8..51ab9e7e6c3 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -32,16 +32,17 @@
#include <linux/perf_event.h>
#include <linux/magic.h>
#include <linux/ratelimit.h>
+#include <linux/context_tracking.h>
#include <asm/firmware.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/mmu_context.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/tlbflush.h>
#include <asm/siginfo.h>
+#include <asm/debug.h>
#include <mm/mmu_decl.h>
#include "icswx.h"
@@ -105,6 +106,80 @@ static int store_updates_sp(struct pt_regs *regs)
}
return 0;
}
+/*
+ * do_page_fault error handling helpers
+ */
+
+#define MM_FAULT_RETURN 0
+#define MM_FAULT_CONTINUE -1
+#define MM_FAULT_ERR(sig) (sig)
+
+static int do_sigbus(struct pt_regs *regs, unsigned long address)
+{
+ siginfo_t info;
+
+ up_read(&current->mm->mmap_sem);
+
+ if (user_mode(regs)) {
+ current->thread.trap_nr = BUS_ADRERR;
+ info.si_signo = SIGBUS;
+ info.si_errno = 0;
+ info.si_code = BUS_ADRERR;
+ info.si_addr = (void __user *)address;
+ force_sig_info(SIGBUS, &info, current);
+ return MM_FAULT_RETURN;
+ }
+ return MM_FAULT_ERR(SIGBUS);
+}
+
+static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
+{
+ /*
+ * Pagefault was interrupted by SIGKILL. We have no reason to
+ * continue the pagefault.
+ */
+ if (fatal_signal_pending(current)) {
+ /*
+ * If we have retry set, the mmap semaphore will have
+ * alrady been released in __lock_page_or_retry(). Else
+ * we release it now.
+ */
+ if (!(fault & VM_FAULT_RETRY))
+ up_read(&current->mm->mmap_sem);
+ /* Coming from kernel, we need to deal with uaccess fixups */
+ if (user_mode(regs))
+ return MM_FAULT_RETURN;
+ return MM_FAULT_ERR(SIGKILL);
+ }
+
+ /* No fault: be happy */
+ if (!(fault & VM_FAULT_ERROR))
+ return MM_FAULT_CONTINUE;
+
+ /* Out of memory */
+ if (fault & VM_FAULT_OOM) {
+ up_read(&current->mm->mmap_sem);
+
+ /*
+ * We ran out of memory, or some other thing happened to us that
+ * made us unable to handle the page fault gracefully.
+ */
+ if (!user_mode(regs))
+ return MM_FAULT_ERR(SIGKILL);
+ pagefault_out_of_memory();
+ return MM_FAULT_RETURN;
+ }
+
+ /* Bus error. x86 handles HWPOISON here, we'll add this if/when
+ * we support the feature in HW
+ */
+ if (fault & VM_FAULT_SIGBUS)
+ return do_sigbus(regs, addr);
+
+ /* We don't understand the fault code, this is fatal */
+ BUG();
+ return MM_FAULT_CONTINUE;
+}
/*
* For 600- and 800-family processors, the error_code parameter is DSISR
@@ -122,13 +197,16 @@ static int store_updates_sp(struct pt_regs *regs)
int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
unsigned long error_code)
{
+ enum ctx_state prev_state = exception_enter();
struct vm_area_struct * vma;
struct mm_struct *mm = current->mm;
- siginfo_t info;
+ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
int code = SEGV_MAPERR;
- int is_write = 0, ret;
+ int is_write = 0;
int trap = TRAP(regs);
int is_exec = trap == 0x400;
+ int fault;
+ int rc = 0, store_update_sp = 0;
#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
/*
@@ -152,36 +230,42 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
* look at it
*/
if (error_code & ICSWX_DSI_UCT) {
- int ret;
-
- ret = acop_handle_fault(regs, address, error_code);
- if (ret)
- return ret;
+ rc = acop_handle_fault(regs, address, error_code);
+ if (rc)
+ goto bail;
}
-#endif
+#endif /* CONFIG_PPC_ICSWX */
if (notify_page_fault(regs))
- return 0;
+ goto bail;
if (unlikely(debugger_fault_handler(regs)))
- return 0;
+ goto bail;
/* On a kernel SLB miss we can only check for a valid exception entry */
- if (!user_mode(regs) && (address >= TASK_SIZE))
- return SIGSEGV;
+ if (!user_mode(regs) && (address >= TASK_SIZE)) {
+ rc = SIGSEGV;
+ goto bail;
+ }
#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE) || \
defined(CONFIG_PPC_BOOK3S_64))
if (error_code & DSISR_DABRMATCH) {
- /* DABR match */
- do_dabr(regs, address, error_code);
- return 0;
+ /* breakpoint match */
+ do_break(regs, address, error_code);
+ goto bail;
}
#endif
+ /* We restore the interrupt state now */
+ if (!arch_irq_disabled_regs(regs))
+ local_irq_enable();
+
if (in_atomic() || mm == NULL) {
- if (!user_mode(regs))
- return SIGSEGV;
+ if (!user_mode(regs)) {
+ rc = SIGSEGV;
+ goto bail;
+ }
/* in_atomic() in user mode is really bad,
as is current->mm == NULL. */
printk(KERN_EMERG "Page fault in user mode with "
@@ -193,6 +277,17 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ /*
+ * We want to do this outside mmap_sem, because reading code around nip
+ * can result in fault, which will cause a deadlock when called with
+ * mmap_sem held
+ */
+ if (user_mode(regs))
+ store_update_sp = store_updates_sp(regs);
+
+ if (user_mode(regs))
+ flags |= FAULT_FLAG_USER;
+
/* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in the
* kernel and should generate an OOPS. Unfortunately, in the case of an
@@ -212,7 +307,15 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
if (!user_mode(regs) && !search_exception_tables(regs->nip))
goto bad_area_nosemaphore;
+retry:
down_read(&mm->mmap_sem);
+ } else {
+ /*
+ * The above down_read_trylock() might have succeeded in
+ * which case we'll have missed the might_sleep() from
+ * down_read():
+ */
+ might_sleep();
}
vma = find_vma(mm, address);
@@ -250,8 +353,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
* between the last mapped region and the stack will
* expand the stack rather than segfaulting.
*/
- if (address + 2048 < uregs->gpr[1]
- && (!user_mode(regs) || !store_updates_sp(regs)))
+ if (address + 2048 < uregs->gpr[1] && !store_update_sp)
goto bad_area;
}
if (expand_stack(vma, address))
@@ -313,6 +415,7 @@ good_area:
} else if (is_write) {
if (!(vma->vm_flags & VM_WRITE))
goto bad_area;
+ flags |= FAULT_FLAG_WRITE;
/* a read */
} else {
/* protection fault */
@@ -327,32 +430,52 @@ good_area:
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
- ret = handle_mm_fault(mm, vma, address, is_write ? FAULT_FLAG_WRITE : 0);
- if (unlikely(ret & VM_FAULT_ERROR)) {
- if (ret & VM_FAULT_OOM)
- goto out_of_memory;
- else if (ret & VM_FAULT_SIGBUS)
- goto do_sigbus;
- BUG();
+ fault = handle_mm_fault(mm, vma, address, flags);
+ if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
+ rc = mm_fault_error(regs, address, fault);
+ if (rc >= MM_FAULT_RETURN)
+ goto bail;
+ else
+ rc = 0;
}
- if (ret & VM_FAULT_MAJOR) {
- current->maj_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
- regs, address);
+
+ /*
+ * Major/minor page fault accounting is only done on the
+ * initial attempt. If we go through a retry, it is extremely
+ * likely that the page will be found in page cache at that point.
+ */
+ if (flags & FAULT_FLAG_ALLOW_RETRY) {
+ if (fault & VM_FAULT_MAJOR) {
+ current->maj_flt++;
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
+ regs, address);
#ifdef CONFIG_PPC_SMLPAR
- if (firmware_has_feature(FW_FEATURE_CMO)) {
- preempt_disable();
- get_lppaca()->page_ins += (1 << PAGE_FACTOR);
- preempt_enable();
+ if (firmware_has_feature(FW_FEATURE_CMO)) {
+ u32 page_ins;
+
+ preempt_disable();
+ page_ins = be32_to_cpu(get_lppaca()->page_ins);
+ page_ins += 1 << PAGE_FACTOR;
+ get_lppaca()->page_ins = cpu_to_be32(page_ins);
+ preempt_enable();
+ }
+#endif /* CONFIG_PPC_SMLPAR */
+ } else {
+ current->min_flt++;
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
+ regs, address);
+ }
+ if (fault & VM_FAULT_RETRY) {
+ /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
+ * of starvation. */
+ flags &= ~FAULT_FLAG_ALLOW_RETRY;
+ flags |= FAULT_FLAG_TRIED;
+ goto retry;
}
-#endif
- } else {
- current->min_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
- regs, address);
}
+
up_read(&mm->mmap_sem);
- return 0;
+ goto bail;
bad_area:
up_read(&mm->mmap_sem);
@@ -361,38 +484,20 @@ bad_area_nosemaphore:
/* User mode accesses cause a SIGSEGV */
if (user_mode(regs)) {
_exception(SIGSEGV, regs, code, address);
- return 0;
+ goto bail;
}
if (is_exec && (error_code & DSISR_PROTFAULT))
printk_ratelimited(KERN_CRIT "kernel tried to execute NX-protected"
" page (%lx) - exploit attempt? (uid: %d)\n",
- address, current_uid());
+ address, from_kuid(&init_user_ns, current_uid()));
- return SIGSEGV;
+ rc = SIGSEGV;
-/*
- * We ran out of memory, or some other thing happened to us that made
- * us unable to handle the page fault gracefully.
- */
-out_of_memory:
- up_read(&mm->mmap_sem);
- if (!user_mode(regs))
- return SIGKILL;
- pagefault_out_of_memory();
- return 0;
+bail:
+ exception_exit(prev_state);
+ return rc;
-do_sigbus:
- up_read(&mm->mmap_sem);
- if (user_mode(regs)) {
- info.si_signo = SIGBUS;
- info.si_errno = 0;
- info.si_code = BUS_ADRERR;
- info.si_addr = (void __user *)address;
- force_sig_info(SIGBUS, &info, current);
- return 0;
- }
- return SIGBUS;
}
/*
diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c
index 66a6fd38e9c..94cd728166d 100644
--- a/arch/powerpc/mm/fsl_booke_mmu.c
+++ b/arch/powerpc/mm/fsl_booke_mmu.c
@@ -52,6 +52,7 @@
#include <asm/smp.h>
#include <asm/machdep.h>
#include <asm/setup.h>
+#include <asm/paca.h>
#include "mmu_decl.h"
@@ -149,12 +150,19 @@ static void settlbcam(int index, unsigned long virt, phys_addr_t phys,
unsigned long calc_cam_sz(unsigned long ram, unsigned long virt,
phys_addr_t phys)
{
- unsigned int camsize = __ilog2(ram) & ~1U;
- unsigned int align = __ffs(virt | phys) & ~1U;
- unsigned long max_cam = (mfspr(SPRN_TLB1CFG) >> 16) & 0xf;
-
- /* Convert (4^max) kB to (2^max) bytes */
- max_cam = max_cam * 2 + 10;
+ unsigned int camsize = __ilog2(ram);
+ unsigned int align = __ffs(virt | phys);
+ unsigned long max_cam;
+
+ if ((mfspr(SPRN_MMUCFG) & MMUCFG_MAVN) == MMUCFG_MAVN_V1) {
+ /* Convert (4^max) kB to (2^max) bytes */
+ max_cam = ((mfspr(SPRN_TLB1CFG) >> 16) & 0xf) * 2 + 10;
+ camsize &= ~1U;
+ align &= ~1U;
+ } else {
+ /* Convert (2^max) kB to (2^max) bytes */
+ max_cam = __ilog2(mfspr(SPRN_TLB1PS)) + 10;
+ }
if (camsize > align)
camsize = align;
@@ -164,11 +172,10 @@ unsigned long calc_cam_sz(unsigned long ram, unsigned long virt,
return 1UL << camsize;
}
-unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx)
+static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt,
+ unsigned long ram, int max_cam_idx)
{
int i;
- unsigned long virt = PAGE_OFFSET;
- phys_addr_t phys = memstart_addr;
unsigned long amount_mapped = 0;
/* Calculate CAM values */
@@ -185,9 +192,23 @@ unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx)
}
tlbcam_index = i;
+#ifdef CONFIG_PPC64
+ get_paca()->tcd.esel_next = i;
+ get_paca()->tcd.esel_max = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY;
+ get_paca()->tcd.esel_first = i;
+#endif
+
return amount_mapped;
}
+unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx)
+{
+ unsigned long virt = PAGE_OFFSET;
+ phys_addr_t phys = memstart_addr;
+
+ return map_mem_in_cams_addr(phys, virt, ram, max_cam_idx);
+}
+
#ifdef CONFIG_PPC32
#if defined(CONFIG_LOWMEM_CAM_NUM_BOOL) && (CONFIG_LOWMEM_CAM_NUM >= NUM_TLBCAMS)
@@ -215,7 +236,9 @@ void __init adjust_total_lowmem(void)
/* adjust lowmem size to __max_low_memory */
ram = min((phys_addr_t)__max_low_memory, (phys_addr_t)total_lowmem);
+ i = switch_to_as1();
__max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM);
+ restore_to_as0(i, 0, 0, 1);
pr_info("Memory CAM mapping: ");
for (i = 0; i < tlbcam_index - 1; i++)
@@ -234,4 +257,62 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base,
/* 64M mapped initially according to head_fsl_booke.S */
memblock_set_current_limit(min_t(u64, limit, 0x04000000));
}
+
+#ifdef CONFIG_RELOCATABLE
+int __initdata is_second_reloc;
+notrace void __init relocate_init(u64 dt_ptr, phys_addr_t start)
+{
+ unsigned long base = KERNELBASE;
+
+ kernstart_addr = start;
+ if (is_second_reloc) {
+ virt_phys_offset = PAGE_OFFSET - memstart_addr;
+ return;
+ }
+
+ /*
+ * Relocatable kernel support based on processing of dynamic
+ * relocation entries. Before we get the real memstart_addr,
+ * We will compute the virt_phys_offset like this:
+ * virt_phys_offset = stext.run - kernstart_addr
+ *
+ * stext.run = (KERNELBASE & ~0x3ffffff) +
+ * (kernstart_addr & 0x3ffffff)
+ * When we relocate, we have :
+ *
+ * (kernstart_addr & 0x3ffffff) = (stext.run & 0x3ffffff)
+ *
+ * hence:
+ * virt_phys_offset = (KERNELBASE & ~0x3ffffff) -
+ * (kernstart_addr & ~0x3ffffff)
+ *
+ */
+ start &= ~0x3ffffff;
+ base &= ~0x3ffffff;
+ virt_phys_offset = base - start;
+ early_get_first_memblock_info(__va(dt_ptr), NULL);
+ /*
+ * We now get the memstart_addr, then we should check if this
+ * address is the same as what the PAGE_OFFSET map to now. If
+ * not we have to change the map of PAGE_OFFSET to memstart_addr
+ * and do a second relocation.
+ */
+ if (start != memstart_addr) {
+ int n;
+ long offset = start - memstart_addr;
+
+ is_second_reloc = 1;
+ n = switch_to_as1();
+ /* map a 64M area for the second relocation */
+ if (memstart_addr > start)
+ map_mem_in_cams(0x4000000, CONFIG_LOWMEM_CAM_NUM);
+ else
+ map_mem_in_cams_addr(start, PAGE_OFFSET + offset,
+ 0x4000000, CONFIG_LOWMEM_CAM_NUM);
+ restore_to_as0(n, offset, __va(dt_ptr), 1);
+ /* We should never reach here */
+ panic("Relocation error");
+ }
+}
+#endif
#endif
diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
index d7efdbf640c..d8746684f60 100644
--- a/arch/powerpc/mm/gup.c
+++ b/arch/powerpc/mm/gup.c
@@ -34,8 +34,13 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
ptep = pte_offset_kernel(&pmd, addr);
do {
- pte_t pte = *ptep;
+ pte_t pte = ACCESS_ONCE(*ptep);
struct page *page;
+ /*
+ * Similar to the PMD case, NUMA hinting must take slow path
+ */
+ if (pte_numa(pte))
+ return 0;
if ((pte_val(pte) & mask) != result)
return 0;
@@ -63,12 +68,30 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
pmdp = pmd_offset(&pud, addr);
do {
- pmd_t pmd = *pmdp;
+ pmd_t pmd = ACCESS_ONCE(*pmdp);
next = pmd_addr_end(addr, end);
- if (pmd_none(pmd))
+ /*
+ * If we find a splitting transparent hugepage we
+ * return zero. That will result in taking the slow
+ * path which will call wait_split_huge_page()
+ * if the pmd is still in splitting state
+ */
+ if (pmd_none(pmd) || pmd_trans_splitting(pmd))
return 0;
- if (is_hugepd(pmdp)) {
+ if (pmd_huge(pmd) || pmd_large(pmd)) {
+ /*
+ * NUMA hinting faults need to be handled in the GUP
+ * slowpath for accounting purposes and so that they
+ * can be serialised against THP migration.
+ */
+ if (pmd_numa(pmd))
+ return 0;
+
+ if (!gup_hugepte((pte_t *)pmdp, PMD_SIZE, addr, next,
+ write, pages, nr))
+ return 0;
+ } else if (is_hugepd(pmdp)) {
if (!gup_hugepd((hugepd_t *)pmdp, PMD_SHIFT,
addr, next, write, pages, nr))
return 0;
@@ -87,12 +110,16 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
pudp = pud_offset(&pgd, addr);
do {
- pud_t pud = *pudp;
+ pud_t pud = ACCESS_ONCE(*pudp);
next = pud_addr_end(addr, end);
if (pud_none(pud))
return 0;
- if (is_hugepd(pudp)) {
+ if (pud_huge(pud)) {
+ if (!gup_hugepte((pte_t *)pudp, PUD_SIZE, addr, next,
+ write, pages, nr))
+ return 0;
+ } else if (is_hugepd(pudp)) {
if (!gup_hugepd((hugepd_t *)pudp, PUD_SHIFT,
addr, next, write, pages, nr))
return 0;
@@ -103,12 +130,13 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
return 1;
}
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages)
+int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
+ struct page **pages)
{
struct mm_struct *mm = current->mm;
unsigned long addr, len, end;
unsigned long next;
+ unsigned long flags;
pgd_t *pgdp;
int nr = 0;
@@ -121,7 +149,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
start, len)))
- goto slow_irqon;
+ return 0;
pr_devel(" aligned: %lx .. %lx\n", start, end);
@@ -142,36 +170,45 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
* So long as we atomically load page table pointers versus teardown,
* we can follow the address down to the the page and take a ref on it.
*/
- local_irq_disable();
+ local_irq_save(flags);
pgdp = pgd_offset(mm, addr);
do {
- pgd_t pgd = *pgdp;
+ pgd_t pgd = ACCESS_ONCE(*pgdp);
pr_devel(" %016lx: normal pgd %p\n", addr,
(void *)pgd_val(pgd));
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
- goto slow;
- if (is_hugepd(pgdp)) {
+ break;
+ if (pgd_huge(pgd)) {
+ if (!gup_hugepte((pte_t *)pgdp, PGDIR_SIZE, addr, next,
+ write, pages, &nr))
+ break;
+ } else if (is_hugepd(pgdp)) {
if (!gup_hugepd((hugepd_t *)pgdp, PGDIR_SHIFT,
addr, next, write, pages, &nr))
- goto slow;
+ break;
} else if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
- goto slow;
+ break;
} while (pgdp++, addr = next, addr != end);
- local_irq_enable();
+ local_irq_restore(flags);
- VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
return nr;
+}
- {
- int ret;
+int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+ struct page **pages)
+{
+ struct mm_struct *mm = current->mm;
+ int nr, ret;
-slow:
- local_irq_enable();
-slow_irqon:
+ start &= PAGE_MASK;
+ nr = __get_user_pages_fast(start, nr_pages, write, pages);
+ ret = nr;
+
+ if (nr < nr_pages) {
pr_devel(" slow path ! nr = %d\n", nr);
/* Try to get the remaining pages with get_user_pages */
@@ -180,7 +217,7 @@ slow_irqon:
down_read(&mm->mmap_sem);
ret = get_user_pages(current, mm, start,
- (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
+ nr_pages - nr, write, 0, pages, NULL);
up_read(&mm->mmap_sem);
/* Have to be a bit careful with return values */
@@ -190,9 +227,9 @@ slow_irqon:
else
ret += nr;
}
-
- return ret;
}
+
+ return ret;
}
#endif /* __HAVE_ARCH_PTE_SPECIAL */
diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S
index b13d58932bf..115347f74ce 100644
--- a/arch/powerpc/mm/hash_low_32.S
+++ b/arch/powerpc/mm/hash_low_32.S
@@ -184,7 +184,7 @@ _GLOBAL(add_hash_page)
add r3,r3,r0 /* note create_hpte trims to 24 bits */
#ifdef CONFIG_SMP
- rlwinm r8,r1,0,0,(31-THREAD_SHIFT) /* use cpu number to make tag */
+ CURRENT_THREAD_INFO(r8, r1) /* use cpu number to make tag */
lwz r8,TI_CPU(r8) /* to go in mmu_hash_lock */
oris r8,r8,12
#endif /* CONFIG_SMP */
@@ -545,7 +545,7 @@ _GLOBAL(flush_hash_pages)
#ifdef CONFIG_SMP
addis r9,r7,mmu_hash_lock@ha
addi r9,r9,mmu_hash_lock@l
- rlwinm r8,r1,0,0,(31-THREAD_SHIFT)
+ CURRENT_THREAD_INFO(r8, r1)
add r8,r8,r7
lwz r8,TI_CPU(r8)
oris r8,r8,9
@@ -639,7 +639,7 @@ _GLOBAL(flush_hash_patch_B)
*/
_GLOBAL(_tlbie)
#ifdef CONFIG_SMP
- rlwinm r8,r1,0,0,(31-THREAD_SHIFT)
+ CURRENT_THREAD_INFO(r8, r1)
lwz r8,TI_CPU(r8)
oris r8,r8,11
mfmsr r10
@@ -677,7 +677,7 @@ _GLOBAL(_tlbie)
*/
_GLOBAL(_tlbia)
#if defined(CONFIG_SMP)
- rlwinm r8,r1,0,0,(31-THREAD_SHIFT)
+ CURRENT_THREAD_INFO(r8, r1)
lwz r8,TI_CPU(r8)
oris r8,r8,10
mfmsr r10
diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S
index a242b5d7cbe..057cbbb4c57 100644
--- a/arch/powerpc/mm/hash_low_64.S
+++ b/arch/powerpc/mm/hash_low_64.S
@@ -34,14 +34,6 @@
* | CR save area (SP + 8)
* SP ---> +-- Back chain (SP + 0)
*/
-#define STACKFRAMESIZE 256
-
-/* Save parameters offsets */
-#define STK_PARM(i) (STACKFRAMESIZE + 48 + ((i)-3)*8)
-
-/* Save non-volatile offsets */
-#define STK_REG(i) (112 + ((i)-14)*8)
-
#ifndef CONFIG_PPC_64K_PAGES
@@ -64,22 +56,22 @@ _GLOBAL(__hash_page_4K)
std r0,16(r1)
stdu r1,-STACKFRAMESIZE(r1)
/* Save all params that we need after a function call */
- std r6,STK_PARM(r6)(r1)
- std r8,STK_PARM(r8)(r1)
- std r9,STK_PARM(r9)(r1)
+ std r6,STK_PARAM(R6)(r1)
+ std r8,STK_PARAM(R8)(r1)
+ std r9,STK_PARAM(R9)(r1)
/* Save non-volatile registers.
* r31 will hold "old PTE"
* r30 is "new PTE"
- * r29 is "va"
+ * r29 is vpn
* r28 is a hash value
* r27 is hashtab mask (maybe dynamic patched instead ?)
*/
- std r27,STK_REG(r27)(r1)
- std r28,STK_REG(r28)(r1)
- std r29,STK_REG(r29)(r1)
- std r30,STK_REG(r30)(r1)
- std r31,STK_REG(r31)(r1)
+ std r27,STK_REG(R27)(r1)
+ std r28,STK_REG(R28)(r1)
+ std r29,STK_REG(R29)(r1)
+ std r30,STK_REG(R30)(r1)
+ std r31,STK_REG(R31)(r1)
/* Step 1:
*
@@ -119,25 +111,33 @@ BEGIN_FTR_SECTION
cmpdi r9,0 /* check segment size */
bne 3f
END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
- /* Calc va and put it in r29 */
- rldicr r29,r5,28,63-28
- rldicl r3,r3,0,36
- or r29,r3,r29
-
- /* Calculate hash value for primary slot and store it in r28 */
- rldicl r5,r5,0,25 /* vsid & 0x0000007fffffffff */
- rldicl r0,r3,64-12,48 /* (ea >> 12) & 0xffff */
- xor r28,r5,r0
+ /* Calc vpn and put it in r29 */
+ sldi r29,r5,SID_SHIFT - VPN_SHIFT
+ rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT - VPN_SHIFT)
+ or r29,r28,r29
+ /*
+ * Calculate hash value for primary slot and store it in r28
+ * r3 = va, r5 = vsid
+ * r0 = (va >> 12) & ((1ul << (28 - 12)) -1)
+ */
+ rldicl r0,r3,64-12,48
+ xor r28,r5,r0 /* hash */
b 4f
-3: /* Calc VA and hash in r29 and r28 for 1T segment */
- sldi r29,r5,40 /* vsid << 40 */
- clrldi r3,r3,24 /* ea & 0xffffffffff */
- rldic r28,r5,25,25 /* (vsid << 25) & 0x7fffffffff */
- clrldi r5,r5,40 /* vsid & 0xffffff */
- rldicl r0,r3,64-12,36 /* (ea >> 12) & 0xfffffff */
- xor r28,r28,r5
- or r29,r3,r29 /* VA */
+3: /* Calc vpn and put it in r29 */
+ sldi r29,r5,SID_SHIFT_1T - VPN_SHIFT
+ rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT_1T - VPN_SHIFT)
+ or r29,r28,r29
+
+ /*
+ * calculate hash value for primary slot and
+ * store it in r28 for 1T segment
+ * r3 = va, r5 = vsid
+ */
+ sldi r28,r5,25 /* vsid << 25 */
+ /* r0 = (va >> 12) & ((1ul << (40 - 12)) -1) */
+ rldicl r0,r3,64-12,36
+ xor r28,r28,r5 /* vsid ^ ( vsid << 25) */
xor r28,r28,r0 /* hash */
/* Convert linux PTE bits into HW equivalents */
@@ -148,7 +148,10 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/
andc r0,r30,r0 /* r0 = pte & ~r0 */
rlwimi r3,r0,32-1,31,31 /* Insert result into PP lsb */
- ori r3,r3,HPTE_R_C /* Always add "C" bit for perf. */
+ /*
+ * Always add "C" bit for perf. Memory coherence is always enabled
+ */
+ ori r3,r3,HPTE_R_C | HPTE_R_M
/* We eventually do the icache sync here (maybe inline that
* code rather than call a C function...)
@@ -156,13 +159,13 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
BEGIN_FTR_SECTION
mr r4,r30
mr r5,r7
- bl .hash_page_do_lazy_icache
+ bl hash_page_do_lazy_icache
END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
/* At this point, r3 contains new PP bits, save them in
* place of "access" in the param area (sic)
*/
- std r3,STK_PARM(r4)(r1)
+ std r3,STK_PARAM(R4)(r1)
/* Get htab_hash_mask */
ld r4,htab_hash_mask@got(2)
@@ -192,12 +195,14 @@ htab_insert_pte:
rldicr r3,r0,3,63-3 /* r3 = (hash & mask) << 3 */
/* Call ppc_md.hpte_insert */
- ld r6,STK_PARM(r4)(r1) /* Retrieve new pp bits */
- mr r4,r29 /* Retrieve va */
+ ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */
+ mr r4,r29 /* Retrieve vpn */
li r7,0 /* !bolted, !secondary */
li r8,MMU_PAGE_4K /* page size */
- ld r9,STK_PARM(r9)(r1) /* segment size */
-_GLOBAL(htab_call_hpte_insert1)
+ li r9,MMU_PAGE_4K /* actual page size */
+ ld r10,STK_PARAM(R9)(r1) /* segment size */
+.globl htab_call_hpte_insert1
+htab_call_hpte_insert1:
bl . /* Patched by htab_finish_init() */
cmpdi 0,r3,0
bge htab_pte_insert_ok /* Insertion successful */
@@ -215,12 +220,14 @@ _GLOBAL(htab_call_hpte_insert1)
rldicr r3,r0,3,63-3 /* r0 = (~hash & mask) << 3 */
/* Call ppc_md.hpte_insert */
- ld r6,STK_PARM(r4)(r1) /* Retrieve new pp bits */
- mr r4,r29 /* Retrieve va */
+ ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */
+ mr r4,r29 /* Retrieve vpn */
li r7,HPTE_V_SECONDARY /* !bolted, secondary */
li r8,MMU_PAGE_4K /* page size */
- ld r9,STK_PARM(r9)(r1) /* segment size */
-_GLOBAL(htab_call_hpte_insert2)
+ li r9,MMU_PAGE_4K /* actual page size */
+ ld r10,STK_PARAM(R9)(r1) /* segment size */
+.globl htab_call_hpte_insert2
+htab_call_hpte_insert2:
bl . /* Patched by htab_finish_init() */
cmpdi 0,r3,0
bge+ htab_pte_insert_ok /* Insertion successful */
@@ -237,7 +244,8 @@ _GLOBAL(htab_call_hpte_insert2)
2: and r0,r5,r27
rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */
/* Call ppc_md.hpte_remove */
-_GLOBAL(htab_call_hpte_remove)
+.globl htab_call_hpte_remove
+htab_call_hpte_remove:
bl . /* Patched by htab_finish_init() */
/* Try all again */
@@ -255,15 +263,15 @@ htab_pte_insert_ok:
* (maybe add eieio may be good still ?)
*/
htab_write_out_pte:
- ld r6,STK_PARM(r6)(r1)
+ ld r6,STK_PARAM(R6)(r1)
std r30,0(r6)
li r3, 0
htab_bail:
- ld r27,STK_REG(r27)(r1)
- ld r28,STK_REG(r28)(r1)
- ld r29,STK_REG(r29)(r1)
- ld r30,STK_REG(r30)(r1)
- ld r31,STK_REG(r31)(r1)
+ ld r27,STK_REG(R27)(r1)
+ ld r28,STK_REG(R28)(r1)
+ ld r29,STK_REG(R29)(r1)
+ ld r30,STK_REG(R30)(r1)
+ ld r31,STK_REG(R31)(r1)
addi r1,r1,STACKFRAMESIZE
ld r0,16(r1)
mtlr r0
@@ -286,11 +294,13 @@ htab_modify_pte:
add r3,r0,r3 /* add slot idx */
/* Call ppc_md.hpte_updatepp */
- mr r5,r29 /* va */
- li r6,MMU_PAGE_4K /* page size */
- ld r7,STK_PARM(r9)(r1) /* segment size */
- ld r8,STK_PARM(r8)(r1) /* get "local" param */
-_GLOBAL(htab_call_hpte_updatepp)
+ mr r5,r29 /* vpn */
+ li r6,MMU_PAGE_4K /* base page size */
+ li r7,MMU_PAGE_4K /* actual page size */
+ ld r8,STK_PARAM(R9)(r1) /* segment size */
+ ld r9,STK_PARAM(R8)(r1) /* get "local" param */
+.globl htab_call_hpte_updatepp
+htab_call_hpte_updatepp:
bl . /* Patched by htab_finish_init() */
/* if we failed because typically the HPTE wasn't really here
@@ -312,7 +322,7 @@ htab_wrong_access:
htab_pte_insert_failure:
/* Bail out restoring old PTE */
- ld r6,STK_PARM(r6)(r1)
+ ld r6,STK_PARAM(R6)(r1)
std r31,0(r6)
li r3,-1
b htab_bail
@@ -340,26 +350,26 @@ _GLOBAL(__hash_page_4K)
std r0,16(r1)
stdu r1,-STACKFRAMESIZE(r1)
/* Save all params that we need after a function call */
- std r6,STK_PARM(r6)(r1)
- std r8,STK_PARM(r8)(r1)
- std r9,STK_PARM(r9)(r1)
+ std r6,STK_PARAM(R6)(r1)
+ std r8,STK_PARAM(R8)(r1)
+ std r9,STK_PARAM(R9)(r1)
/* Save non-volatile registers.
* r31 will hold "old PTE"
* r30 is "new PTE"
- * r29 is "va"
+ * r29 is vpn
* r28 is a hash value
* r27 is hashtab mask (maybe dynamic patched instead ?)
* r26 is the hidx mask
* r25 is the index in combo page
*/
- std r25,STK_REG(r25)(r1)
- std r26,STK_REG(r26)(r1)
- std r27,STK_REG(r27)(r1)
- std r28,STK_REG(r28)(r1)
- std r29,STK_REG(r29)(r1)
- std r30,STK_REG(r30)(r1)
- std r31,STK_REG(r31)(r1)
+ std r25,STK_REG(R25)(r1)
+ std r26,STK_REG(R26)(r1)
+ std r27,STK_REG(R27)(r1)
+ std r28,STK_REG(R28)(r1)
+ std r29,STK_REG(R29)(r1)
+ std r30,STK_REG(R30)(r1)
+ std r31,STK_REG(R31)(r1)
/* Step 1:
*
@@ -402,25 +412,41 @@ BEGIN_FTR_SECTION
cmpdi r9,0 /* check segment size */
bne 3f
END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
- /* Calc va and put it in r29 */
- rldicr r29,r5,28,63-28 /* r29 = (vsid << 28) */
- rldicl r3,r3,0,36 /* r3 = (ea & 0x0fffffff) */
- or r29,r3,r29 /* r29 = va */
-
- /* Calculate hash value for primary slot and store it in r28 */
- rldicl r5,r5,0,25 /* vsid & 0x0000007fffffffff */
- rldicl r0,r3,64-12,48 /* (ea >> 12) & 0xffff */
- xor r28,r5,r0
+ /* Calc vpn and put it in r29 */
+ sldi r29,r5,SID_SHIFT - VPN_SHIFT
+ /*
+ * clrldi r3,r3,64 - SID_SHIFT --> ea & 0xfffffff
+ * srdi r28,r3,VPN_SHIFT
+ */
+ rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT - VPN_SHIFT)
+ or r29,r28,r29
+ /*
+ * Calculate hash value for primary slot and store it in r28
+ * r3 = va, r5 = vsid
+ * r0 = (va >> 12) & ((1ul << (28 - 12)) -1)
+ */
+ rldicl r0,r3,64-12,48
+ xor r28,r5,r0 /* hash */
b 4f
-3: /* Calc VA and hash in r29 and r28 for 1T segment */
- sldi r29,r5,40 /* vsid << 40 */
- clrldi r3,r3,24 /* ea & 0xffffffffff */
- rldic r28,r5,25,25 /* (vsid << 25) & 0x7fffffffff */
- clrldi r5,r5,40 /* vsid & 0xffffff */
- rldicl r0,r3,64-12,36 /* (ea >> 12) & 0xfffffff */
- xor r28,r28,r5
- or r29,r3,r29 /* VA */
+3: /* Calc vpn and put it in r29 */
+ sldi r29,r5,SID_SHIFT_1T - VPN_SHIFT
+ /*
+ * clrldi r3,r3,64 - SID_SHIFT_1T --> ea & 0xffffffffff
+ * srdi r28,r3,VPN_SHIFT
+ */
+ rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT_1T - VPN_SHIFT)
+ or r29,r28,r29
+
+ /*
+ * Calculate hash value for primary slot and
+ * store it in r28 for 1T segment
+ * r3 = va, r5 = vsid
+ */
+ sldi r28,r5,25 /* vsid << 25 */
+ /* r0 = (va >> 12) & ((1ul << (40 - 12)) -1) */
+ rldicl r0,r3,64-12,36
+ xor r28,r28,r5 /* vsid ^ ( vsid << 25) */
xor r28,r28,r0 /* hash */
/* Convert linux PTE bits into HW equivalents */
@@ -438,7 +464,10 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/
andc r0,r3,r0 /* r0 = pte & ~r0 */
rlwimi r3,r0,32-1,31,31 /* Insert result into PP lsb */
- ori r3,r3,HPTE_R_C /* Always add "C" bit for perf. */
+ /*
+ * Always add "C" bit for perf. Memory coherence is always enabled
+ */
+ ori r3,r3,HPTE_R_C | HPTE_R_M
/* We eventually do the icache sync here (maybe inline that
* code rather than call a C function...)
@@ -446,13 +475,13 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
BEGIN_FTR_SECTION
mr r4,r30
mr r5,r7
- bl .hash_page_do_lazy_icache
+ bl hash_page_do_lazy_icache
END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
/* At this point, r3 contains new PP bits, save them in
* place of "access" in the param area (sic)
*/
- std r3,STK_PARM(r4)(r1)
+ std r3,STK_PARAM(R4)(r1)
/* Get htab_hash_mask */
ld r4,htab_hash_mask@got(2)
@@ -473,8 +502,8 @@ END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
andis. r0,r31,_PAGE_COMBO@h
beq htab_inval_old_hpte
- ld r6,STK_PARM(r6)(r1)
- ori r26,r6,0x8000 /* Load the hidx mask */
+ ld r6,STK_PARAM(R6)(r1)
+ ori r26,r6,PTE_PAGE_HIDX_OFFSET /* Load the hidx mask. */
ld r26,0(r26)
addi r5,r25,36 /* Check actual HPTE_SUB bit, this */
rldcr. r0,r31,r5,0 /* must match pgtable.h definition */
@@ -495,12 +524,14 @@ htab_special_pfn:
rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */
/* Call ppc_md.hpte_insert */
- ld r6,STK_PARM(r4)(r1) /* Retrieve new pp bits */
- mr r4,r29 /* Retrieve va */
+ ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */
+ mr r4,r29 /* Retrieve vpn */
li r7,0 /* !bolted, !secondary */
li r8,MMU_PAGE_4K /* page size */
- ld r9,STK_PARM(r9)(r1) /* segment size */
-_GLOBAL(htab_call_hpte_insert1)
+ li r9,MMU_PAGE_4K /* actual page size */
+ ld r10,STK_PARAM(R9)(r1) /* segment size */
+.globl htab_call_hpte_insert1
+htab_call_hpte_insert1:
bl . /* patched by htab_finish_init() */
cmpdi 0,r3,0
bge htab_pte_insert_ok /* Insertion successful */
@@ -522,12 +553,14 @@ _GLOBAL(htab_call_hpte_insert1)
rldicr r3,r0,3,63-3 /* r0 = (~hash & mask) << 3 */
/* Call ppc_md.hpte_insert */
- ld r6,STK_PARM(r4)(r1) /* Retrieve new pp bits */
- mr r4,r29 /* Retrieve va */
+ ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */
+ mr r4,r29 /* Retrieve vpn */
li r7,HPTE_V_SECONDARY /* !bolted, secondary */
li r8,MMU_PAGE_4K /* page size */
- ld r9,STK_PARM(r9)(r1) /* segment size */
-_GLOBAL(htab_call_hpte_insert2)
+ li r9,MMU_PAGE_4K /* actual page size */
+ ld r10,STK_PARAM(R9)(r1) /* segment size */
+.globl htab_call_hpte_insert2
+htab_call_hpte_insert2:
bl . /* patched by htab_finish_init() */
cmpdi 0,r3,0
bge+ htab_pte_insert_ok /* Insertion successful */
@@ -544,7 +577,8 @@ _GLOBAL(htab_call_hpte_insert2)
2: and r0,r5,r27
rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */
/* Call ppc_md.hpte_remove */
-_GLOBAL(htab_call_hpte_remove)
+.globl htab_call_hpte_remove
+htab_call_hpte_remove:
bl . /* patched by htab_finish_init() */
/* Try all again */
@@ -555,13 +589,13 @@ _GLOBAL(htab_call_hpte_remove)
* useless now that the segment has been switched to 4k pages.
*/
htab_inval_old_hpte:
- mr r3,r29 /* virtual addr */
+ mr r3,r29 /* vpn */
mr r4,r31 /* PTE.pte */
li r5,0 /* PTE.hidx */
li r6,MMU_PAGE_64K /* psize */
- ld r7,STK_PARM(r9)(r1) /* ssize */
- ld r8,STK_PARM(r8)(r1) /* local */
- bl .flush_hash_page
+ ld r7,STK_PARAM(R9)(r1) /* ssize */
+ ld r8,STK_PARAM(R8)(r1) /* local */
+ bl flush_hash_page
/* Clear out _PAGE_HPTE_SUB bits in the new linux PTE */
lis r0,_PAGE_HPTE_SUB@h
ori r0,r0,_PAGE_HPTE_SUB@l
@@ -576,7 +610,7 @@ htab_pte_insert_ok:
/* Insert slot number & secondary bit in PTE second half,
* clear _PAGE_BUSY and set approriate HPTE slot bit
*/
- ld r6,STK_PARM(r6)(r1)
+ ld r6,STK_PARAM(R6)(r1)
li r0,_PAGE_BUSY
andc r30,r30,r0
/* HPTE SUB bit */
@@ -591,19 +625,19 @@ htab_pte_insert_ok:
sld r4,r4,r5
andc r26,r26,r4
or r26,r26,r3
- ori r5,r6,0x8000
+ ori r5,r6,PTE_PAGE_HIDX_OFFSET
std r26,0(r5)
lwsync
std r30,0(r6)
li r3, 0
htab_bail:
- ld r25,STK_REG(r25)(r1)
- ld r26,STK_REG(r26)(r1)
- ld r27,STK_REG(r27)(r1)
- ld r28,STK_REG(r28)(r1)
- ld r29,STK_REG(r29)(r1)
- ld r30,STK_REG(r30)(r1)
- ld r31,STK_REG(r31)(r1)
+ ld r25,STK_REG(R25)(r1)
+ ld r26,STK_REG(R26)(r1)
+ ld r27,STK_REG(R27)(r1)
+ ld r28,STK_REG(R28)(r1)
+ ld r29,STK_REG(R29)(r1)
+ ld r30,STK_REG(R30)(r1)
+ ld r31,STK_REG(R31)(r1)
addi r1,r1,STACKFRAMESIZE
ld r0,16(r1)
mtlr r0
@@ -628,11 +662,13 @@ htab_modify_pte:
add r3,r0,r3 /* add slot idx */
/* Call ppc_md.hpte_updatepp */
- mr r5,r29 /* va */
- li r6,MMU_PAGE_4K /* page size */
- ld r7,STK_PARM(r9)(r1) /* segment size */
- ld r8,STK_PARM(r8)(r1) /* get "local" param */
-_GLOBAL(htab_call_hpte_updatepp)
+ mr r5,r29 /* vpn */
+ li r6,MMU_PAGE_4K /* base page size */
+ li r7,MMU_PAGE_4K /* actual page size */
+ ld r8,STK_PARAM(R9)(r1) /* segment size */
+ ld r9,STK_PARAM(R8)(r1) /* get "local" param */
+.globl htab_call_hpte_updatepp
+htab_call_hpte_updatepp:
bl . /* patched by htab_finish_init() */
/* if we failed because typically the HPTE wasn't really here
@@ -644,7 +680,7 @@ _GLOBAL(htab_call_hpte_updatepp)
/* Clear the BUSY bit and Write out the PTE */
li r0,_PAGE_BUSY
andc r30,r30,r0
- ld r6,STK_PARM(r6)(r1)
+ ld r6,STK_PARAM(R6)(r1)
std r30,0(r6)
li r3,0
b htab_bail
@@ -657,7 +693,7 @@ htab_wrong_access:
htab_pte_insert_failure:
/* Bail out restoring old PTE */
- ld r6,STK_PARM(r6)(r1)
+ ld r6,STK_PARAM(R6)(r1)
std r31,0(r6)
li r3,-1
b htab_bail
@@ -677,22 +713,22 @@ _GLOBAL(__hash_page_64K)
std r0,16(r1)
stdu r1,-STACKFRAMESIZE(r1)
/* Save all params that we need after a function call */
- std r6,STK_PARM(r6)(r1)
- std r8,STK_PARM(r8)(r1)
- std r9,STK_PARM(r9)(r1)
+ std r6,STK_PARAM(R6)(r1)
+ std r8,STK_PARAM(R8)(r1)
+ std r9,STK_PARAM(R9)(r1)
/* Save non-volatile registers.
* r31 will hold "old PTE"
* r30 is "new PTE"
- * r29 is "va"
+ * r29 is vpn
* r28 is a hash value
* r27 is hashtab mask (maybe dynamic patched instead ?)
*/
- std r27,STK_REG(r27)(r1)
- std r28,STK_REG(r28)(r1)
- std r29,STK_REG(r29)(r1)
- std r30,STK_REG(r30)(r1)
- std r31,STK_REG(r31)(r1)
+ std r27,STK_REG(R27)(r1)
+ std r28,STK_REG(R28)(r1)
+ std r29,STK_REG(R29)(r1)
+ std r30,STK_REG(R30)(r1)
+ std r31,STK_REG(R31)(r1)
/* Step 1:
*
@@ -737,25 +773,32 @@ BEGIN_FTR_SECTION
cmpdi r9,0 /* check segment size */
bne 3f
END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
- /* Calc va and put it in r29 */
- rldicr r29,r5,28,63-28
- rldicl r3,r3,0,36
- or r29,r3,r29
-
- /* Calculate hash value for primary slot and store it in r28 */
- rldicl r5,r5,0,25 /* vsid & 0x0000007fffffffff */
- rldicl r0,r3,64-16,52 /* (ea >> 16) & 0xfff */
- xor r28,r5,r0
+ /* Calc vpn and put it in r29 */
+ sldi r29,r5,SID_SHIFT - VPN_SHIFT
+ rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT - VPN_SHIFT)
+ or r29,r28,r29
+
+ /* Calculate hash value for primary slot and store it in r28
+ * r3 = va, r5 = vsid
+ * r0 = (va >> 16) & ((1ul << (28 - 16)) -1)
+ */
+ rldicl r0,r3,64-16,52
+ xor r28,r5,r0 /* hash */
b 4f
-3: /* Calc VA and hash in r29 and r28 for 1T segment */
- sldi r29,r5,40 /* vsid << 40 */
- clrldi r3,r3,24 /* ea & 0xffffffffff */
- rldic r28,r5,25,25 /* (vsid << 25) & 0x7fffffffff */
- clrldi r5,r5,40 /* vsid & 0xffffff */
- rldicl r0,r3,64-16,40 /* (ea >> 16) & 0xffffff */
- xor r28,r28,r5
- or r29,r3,r29 /* VA */
+3: /* Calc vpn and put it in r29 */
+ sldi r29,r5,SID_SHIFT_1T - VPN_SHIFT
+ rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT_1T - VPN_SHIFT)
+ or r29,r28,r29
+ /*
+ * calculate hash value for primary slot and
+ * store it in r28 for 1T segment
+ * r3 = va, r5 = vsid
+ */
+ sldi r28,r5,25 /* vsid << 25 */
+ /* r0 = (va >> 16) & ((1ul << (40 - 16)) -1) */
+ rldicl r0,r3,64-16,40
+ xor r28,r28,r5 /* vsid ^ ( vsid << 25) */
xor r28,r28,r0 /* hash */
/* Convert linux PTE bits into HW equivalents */
@@ -766,7 +809,10 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/
andc r0,r30,r0 /* r0 = pte & ~r0 */
rlwimi r3,r0,32-1,31,31 /* Insert result into PP lsb */
- ori r3,r3,HPTE_R_C /* Always add "C" bit for perf. */
+ /*
+ * Always add "C" bit for perf. Memory coherence is always enabled
+ */
+ ori r3,r3,HPTE_R_C | HPTE_R_M
/* We eventually do the icache sync here (maybe inline that
* code rather than call a C function...)
@@ -774,13 +820,13 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
BEGIN_FTR_SECTION
mr r4,r30
mr r5,r7
- bl .hash_page_do_lazy_icache
+ bl hash_page_do_lazy_icache
END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
/* At this point, r3 contains new PP bits, save them in
* place of "access" in the param area (sic)
*/
- std r3,STK_PARM(r4)(r1)
+ std r3,STK_PARAM(R4)(r1)
/* Get htab_hash_mask */
ld r4,htab_hash_mask@got(2)
@@ -813,12 +859,14 @@ ht64_insert_pte:
rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */
/* Call ppc_md.hpte_insert */
- ld r6,STK_PARM(r4)(r1) /* Retrieve new pp bits */
- mr r4,r29 /* Retrieve va */
+ ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */
+ mr r4,r29 /* Retrieve vpn */
li r7,0 /* !bolted, !secondary */
li r8,MMU_PAGE_64K
- ld r9,STK_PARM(r9)(r1) /* segment size */
-_GLOBAL(ht64_call_hpte_insert1)
+ li r9,MMU_PAGE_64K /* actual page size */
+ ld r10,STK_PARAM(R9)(r1) /* segment size */
+.globl ht64_call_hpte_insert1
+ht64_call_hpte_insert1:
bl . /* patched by htab_finish_init() */
cmpdi 0,r3,0
bge ht64_pte_insert_ok /* Insertion successful */
@@ -836,12 +884,14 @@ _GLOBAL(ht64_call_hpte_insert1)
rldicr r3,r0,3,63-3 /* r0 = (~hash & mask) << 3 */
/* Call ppc_md.hpte_insert */
- ld r6,STK_PARM(r4)(r1) /* Retrieve new pp bits */
- mr r4,r29 /* Retrieve va */
+ ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */
+ mr r4,r29 /* Retrieve vpn */
li r7,HPTE_V_SECONDARY /* !bolted, secondary */
li r8,MMU_PAGE_64K
- ld r9,STK_PARM(r9)(r1) /* segment size */
-_GLOBAL(ht64_call_hpte_insert2)
+ li r9,MMU_PAGE_64K /* actual page size */
+ ld r10,STK_PARAM(R9)(r1) /* segment size */
+.globl ht64_call_hpte_insert2
+ht64_call_hpte_insert2:
bl . /* patched by htab_finish_init() */
cmpdi 0,r3,0
bge+ ht64_pte_insert_ok /* Insertion successful */
@@ -858,7 +908,8 @@ _GLOBAL(ht64_call_hpte_insert2)
2: and r0,r5,r27
rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */
/* Call ppc_md.hpte_remove */
-_GLOBAL(ht64_call_hpte_remove)
+.globl ht64_call_hpte_remove
+ht64_call_hpte_remove:
bl . /* patched by htab_finish_init() */
/* Try all again */
@@ -876,15 +927,15 @@ ht64_pte_insert_ok:
* (maybe add eieio may be good still ?)
*/
ht64_write_out_pte:
- ld r6,STK_PARM(r6)(r1)
+ ld r6,STK_PARAM(R6)(r1)
std r30,0(r6)
li r3, 0
ht64_bail:
- ld r27,STK_REG(r27)(r1)
- ld r28,STK_REG(r28)(r1)
- ld r29,STK_REG(r29)(r1)
- ld r30,STK_REG(r30)(r1)
- ld r31,STK_REG(r31)(r1)
+ ld r27,STK_REG(R27)(r1)
+ ld r28,STK_REG(R28)(r1)
+ ld r29,STK_REG(R29)(r1)
+ ld r30,STK_REG(R30)(r1)
+ ld r31,STK_REG(R31)(r1)
addi r1,r1,STACKFRAMESIZE
ld r0,16(r1)
mtlr r0
@@ -907,11 +958,13 @@ ht64_modify_pte:
add r3,r0,r3 /* add slot idx */
/* Call ppc_md.hpte_updatepp */
- mr r5,r29 /* va */
- li r6,MMU_PAGE_64K
- ld r7,STK_PARM(r9)(r1) /* segment size */
- ld r8,STK_PARM(r8)(r1) /* get "local" param */
-_GLOBAL(ht64_call_hpte_updatepp)
+ mr r5,r29 /* vpn */
+ li r6,MMU_PAGE_64K /* base page size */
+ li r7,MMU_PAGE_64K /* actual page size */
+ ld r8,STK_PARAM(R9)(r1) /* segment size */
+ ld r9,STK_PARAM(R8)(r1) /* get "local" param */
+.globl ht64_call_hpte_updatepp
+ht64_call_hpte_updatepp:
bl . /* patched by htab_finish_init() */
/* if we failed because typically the HPTE wasn't really here
@@ -933,7 +986,7 @@ ht64_wrong_access:
ht64_pte_insert_failure:
/* Bail out restoring old PTE */
- ld r6,STK_PARM(r6)(r1)
+ ld r6,STK_PARAM(R6)(r1)
std r31,0(r6)
li r3,-1
b ht64_bail
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 90039bc6411..cf1d325eae8 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -14,10 +14,10 @@
#include <linux/spinlock.h>
#include <linux/bitops.h>
+#include <linux/of.h>
#include <linux/threads.h>
#include <linux/smp.h>
-#include <asm/abs_addr.h>
#include <asm/machdep.h>
#include <asm/mmu.h>
#include <asm/mmu_context.h>
@@ -35,30 +35,61 @@
#define DBG_LOW(fmt...)
#endif
+#ifdef __BIG_ENDIAN__
#define HPTE_LOCK_BIT 3
+#else
+#define HPTE_LOCK_BIT (56+3)
+#endif
DEFINE_RAW_SPINLOCK(native_tlbie_lock);
-static inline void __tlbie(unsigned long va, int psize, int ssize)
+static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize)
{
+ unsigned long va;
unsigned int penc;
+ unsigned long sllp;
- /* clear top 16 bits, non SLS segment */
+ /*
+ * We need 14 to 65 bits of va for a tlibe of 4K page
+ * With vpn we ignore the lower VPN_SHIFT bits already.
+ * And top two bits are already ignored because we can
+ * only accomadate 76 bits in a 64 bit vpn with a VPN_SHIFT
+ * of 12.
+ */
+ va = vpn << VPN_SHIFT;
+ /*
+ * clear top 16 bits of 64bit va, non SLS segment
+ * Older versions of the architecture (2.02 and earler) require the
+ * masking of the top 16 bits.
+ */
va &= ~(0xffffULL << 48);
switch (psize) {
case MMU_PAGE_4K:
- va &= ~0xffful;
+ /* clear out bits after (52) [0....52.....63] */
+ va &= ~((1ul << (64 - 52)) - 1);
va |= ssize << 8;
+ sllp = ((mmu_psize_defs[apsize].sllp & SLB_VSID_L) >> 6) |
+ ((mmu_psize_defs[apsize].sllp & SLB_VSID_LP) >> 4);
+ va |= sllp << 5;
asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2)
: : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
: "memory");
break;
default:
- penc = mmu_psize_defs[psize].penc;
- va &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
+ /* We need 14 to 14 + i bits of va */
+ penc = mmu_psize_defs[psize].penc[apsize];
+ va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1);
va |= penc << 12;
va |= ssize << 8;
+ /*
+ * AVAL bits:
+ * We don't need all the bits, but rest of the bits
+ * must be ignored by the processor.
+ * vpn cover upto 65 bits of va. (0...65) and we need
+ * 58..64 bits of va.
+ */
+ va |= (vpn & 0xfe); /* AVAL */
va |= 1; /* L */
asm volatile(ASM_FTR_IFCLR("tlbie %0,1", PPC_TLBIE(%1,%0), %2)
: : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
@@ -67,25 +98,46 @@ static inline void __tlbie(unsigned long va, int psize, int ssize)
}
}
-static inline void __tlbiel(unsigned long va, int psize, int ssize)
+static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize)
{
+ unsigned long va;
unsigned int penc;
+ unsigned long sllp;
- /* clear top 16 bits, non SLS segment */
+ /* VPN_SHIFT can be atmost 12 */
+ va = vpn << VPN_SHIFT;
+ /*
+ * clear top 16 bits of 64 bit va, non SLS segment
+ * Older versions of the architecture (2.02 and earler) require the
+ * masking of the top 16 bits.
+ */
va &= ~(0xffffULL << 48);
switch (psize) {
case MMU_PAGE_4K:
- va &= ~0xffful;
+ /* clear out bits after(52) [0....52.....63] */
+ va &= ~((1ul << (64 - 52)) - 1);
va |= ssize << 8;
+ sllp = ((mmu_psize_defs[apsize].sllp & SLB_VSID_L) >> 6) |
+ ((mmu_psize_defs[apsize].sllp & SLB_VSID_LP) >> 4);
+ va |= sllp << 5;
asm volatile(".long 0x7c000224 | (%0 << 11) | (0 << 21)"
: : "r"(va) : "memory");
break;
default:
- penc = mmu_psize_defs[psize].penc;
- va &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
+ /* We need 14 to 14 + i bits of va */
+ penc = mmu_psize_defs[psize].penc[apsize];
+ va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1);
va |= penc << 12;
va |= ssize << 8;
+ /*
+ * AVAL bits:
+ * We don't need all the bits, but rest of the bits
+ * must be ignored by the processor.
+ * vpn cover upto 65 bits of va. (0...65) and we need
+ * 58..64 bits of va.
+ */
+ va |= (vpn & 0xfe);
va |= 1; /* L */
asm volatile(".long 0x7c000224 | (%0 << 11) | (1 << 21)"
: : "r"(va) : "memory");
@@ -94,7 +146,8 @@ static inline void __tlbiel(unsigned long va, int psize, int ssize)
}
-static inline void tlbie(unsigned long va, int psize, int ssize, int local)
+static inline void tlbie(unsigned long vpn, int psize, int apsize,
+ int ssize, int local)
{
unsigned int use_local = local && mmu_has_feature(MMU_FTR_TLBIEL);
int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
@@ -105,10 +158,10 @@ static inline void tlbie(unsigned long va, int psize, int ssize, int local)
raw_spin_lock(&native_tlbie_lock);
asm volatile("ptesync": : :"memory");
if (use_local) {
- __tlbiel(va, psize, ssize);
+ __tlbiel(vpn, psize, apsize, ssize);
asm volatile("ptesync": : :"memory");
} else {
- __tlbie(va, psize, ssize);
+ __tlbie(vpn, psize, apsize, ssize);
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
if (lock_tlbie && !use_local)
@@ -117,7 +170,7 @@ static inline void tlbie(unsigned long va, int psize, int ssize, int local)
static inline void native_lock_hpte(struct hash_pte *hptep)
{
- unsigned long *word = &hptep->v;
+ unsigned long *word = (unsigned long *)&hptep->v;
while (1) {
if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word))
@@ -129,30 +182,30 @@ static inline void native_lock_hpte(struct hash_pte *hptep)
static inline void native_unlock_hpte(struct hash_pte *hptep)
{
- unsigned long *word = &hptep->v;
+ unsigned long *word = (unsigned long *)&hptep->v;
clear_bit_unlock(HPTE_LOCK_BIT, word);
}
-static long native_hpte_insert(unsigned long hpte_group, unsigned long va,
+static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
unsigned long pa, unsigned long rflags,
- unsigned long vflags, int psize, int ssize)
+ unsigned long vflags, int psize, int apsize, int ssize)
{
struct hash_pte *hptep = htab_address + hpte_group;
unsigned long hpte_v, hpte_r;
int i;
if (!(vflags & HPTE_V_BOLTED)) {
- DBG_LOW(" insert(group=%lx, va=%016lx, pa=%016lx,"
+ DBG_LOW(" insert(group=%lx, vpn=%016lx, pa=%016lx,"
" rflags=%lx, vflags=%lx, psize=%d)\n",
- hpte_group, va, pa, rflags, vflags, psize);
+ hpte_group, vpn, pa, rflags, vflags, psize);
}
for (i = 0; i < HPTES_PER_GROUP; i++) {
- if (! (hptep->v & HPTE_V_VALID)) {
+ if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID)) {
/* retry with lock held */
native_lock_hpte(hptep);
- if (! (hptep->v & HPTE_V_VALID))
+ if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID))
break;
native_unlock_hpte(hptep);
}
@@ -163,22 +216,22 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long va,
if (i == HPTES_PER_GROUP)
return -1;
- hpte_v = hpte_encode_v(va, psize, ssize) | vflags | HPTE_V_VALID;
- hpte_r = hpte_encode_r(pa, psize) | rflags;
+ hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
+ hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
if (!(vflags & HPTE_V_BOLTED)) {
DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n",
i, hpte_v, hpte_r);
}
- hptep->r = hpte_r;
+ hptep->r = cpu_to_be64(hpte_r);
/* Guarantee the second dword is visible before the valid bit */
eieio();
/*
* Now set the first dword including the valid bit
* NOTE: this also unlocks the hpte
*/
- hptep->v = hpte_v;
+ hptep->v = cpu_to_be64(hpte_v);
__asm__ __volatile__ ("ptesync" : : : "memory");
@@ -199,12 +252,12 @@ static long native_hpte_remove(unsigned long hpte_group)
for (i = 0; i < HPTES_PER_GROUP; i++) {
hptep = htab_address + hpte_group + slot_offset;
- hpte_v = hptep->v;
+ hpte_v = be64_to_cpu(hptep->v);
if ((hpte_v & HPTE_V_VALID) && !(hpte_v & HPTE_V_BOLTED)) {
/* retry with lock held */
native_lock_hpte(hptep);
- hpte_v = hptep->v;
+ hpte_v = be64_to_cpu(hptep->v);
if ((hpte_v & HPTE_V_VALID)
&& !(hpte_v & HPTE_V_BOLTED))
break;
@@ -225,41 +278,46 @@ static long native_hpte_remove(unsigned long hpte_group)
}
static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
- unsigned long va, int psize, int ssize,
- int local)
+ unsigned long vpn, int bpsize,
+ int apsize, int ssize, int local)
{
struct hash_pte *hptep = htab_address + slot;
unsigned long hpte_v, want_v;
int ret = 0;
- want_v = hpte_encode_v(va, psize, ssize);
+ want_v = hpte_encode_avpn(vpn, bpsize, ssize);
- DBG_LOW(" update(va=%016lx, avpnv=%016lx, hash=%016lx, newpp=%x)",
- va, want_v & HPTE_V_AVPN, slot, newpp);
+ DBG_LOW(" update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)",
+ vpn, want_v & HPTE_V_AVPN, slot, newpp);
native_lock_hpte(hptep);
- hpte_v = hptep->v;
-
- /* Even if we miss, we need to invalidate the TLB */
+ hpte_v = be64_to_cpu(hptep->v);
+ /*
+ * We need to invalidate the TLB always because hpte_remove doesn't do
+ * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
+ * random entry from it. When we do that we don't invalidate the TLB
+ * (hpte_remove) because we assume the old translation is still
+ * technically "valid".
+ */
if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) {
DBG_LOW(" -> miss\n");
ret = -1;
} else {
DBG_LOW(" -> hit\n");
/* Update the HPTE */
- hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) |
- (newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_C));
+ hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) & ~(HPTE_R_PP | HPTE_R_N)) |
+ (newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_C)));
}
native_unlock_hpte(hptep);
/* Ensure it is out of the tlb too. */
- tlbie(va, psize, ssize, local);
+ tlbie(vpn, bpsize, apsize, ssize, local);
return ret;
}
-static long native_hpte_find(unsigned long va, int psize, int ssize)
+static long native_hpte_find(unsigned long vpn, int psize, int ssize)
{
struct hash_pte *hptep;
unsigned long hash;
@@ -267,14 +325,14 @@ static long native_hpte_find(unsigned long va, int psize, int ssize)
long slot;
unsigned long want_v, hpte_v;
- hash = hpt_hash(va, mmu_psize_defs[psize].shift, ssize);
- want_v = hpte_encode_v(va, psize, ssize);
+ hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
+ want_v = hpte_encode_avpn(vpn, psize, ssize);
/* Bolted mappings are only ever in the primary group */
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
for (i = 0; i < HPTES_PER_GROUP; i++) {
hptep = htab_address + slot;
- hpte_v = hptep->v;
+ hpte_v = be64_to_cpu(hptep->v);
if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
/* HPTE matches */
@@ -295,28 +353,32 @@ static long native_hpte_find(unsigned long va, int psize, int ssize)
static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
int psize, int ssize)
{
- unsigned long vsid, va;
+ unsigned long vpn;
+ unsigned long vsid;
long slot;
struct hash_pte *hptep;
vsid = get_kernel_vsid(ea, ssize);
- va = hpt_va(ea, vsid, ssize);
+ vpn = hpt_vpn(ea, vsid, ssize);
- slot = native_hpte_find(va, psize, ssize);
+ slot = native_hpte_find(vpn, psize, ssize);
if (slot == -1)
panic("could not find page to bolt\n");
hptep = htab_address + slot;
/* Update the HPTE */
- hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) |
- (newpp & (HPTE_R_PP | HPTE_R_N));
-
- /* Ensure it is out of the tlb too. */
- tlbie(va, psize, ssize, 0);
+ hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) &
+ ~(HPTE_R_PP | HPTE_R_N)) |
+ (newpp & (HPTE_R_PP | HPTE_R_N)));
+ /*
+ * Ensure it is out of the tlb too. Bolted entries base and
+ * actual page size will be same.
+ */
+ tlbie(vpn, psize, psize, ssize, 0);
}
-static void native_hpte_invalidate(unsigned long slot, unsigned long va,
- int psize, int ssize, int local)
+static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
+ int bpsize, int apsize, int ssize, int local)
{
struct hash_pte *hptep = htab_address + slot;
unsigned long hpte_v;
@@ -325,13 +387,19 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long va,
local_irq_save(flags);
- DBG_LOW(" invalidate(va=%016lx, hash: %x)\n", va, slot);
+ DBG_LOW(" invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot);
- want_v = hpte_encode_v(va, psize, ssize);
+ want_v = hpte_encode_avpn(vpn, bpsize, ssize);
native_lock_hpte(hptep);
- hpte_v = hptep->v;
+ hpte_v = be64_to_cpu(hptep->v);
- /* Even if we miss, we need to invalidate the TLB */
+ /*
+ * We need to invalidate the TLB always because hpte_remove doesn't do
+ * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
+ * random entry from it. When we do that we don't invalidate the TLB
+ * (hpte_remove) because we assume the old translation is still
+ * technically "valid".
+ */
if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
native_unlock_hpte(hptep);
else
@@ -339,73 +407,175 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long va,
hptep->v = 0;
/* Invalidate the TLB */
- tlbie(va, psize, ssize, local);
+ tlbie(vpn, bpsize, apsize, ssize, local);
+
+ local_irq_restore(flags);
+}
+
+static void native_hugepage_invalidate(struct mm_struct *mm,
+ unsigned char *hpte_slot_array,
+ unsigned long addr, int psize)
+{
+ int ssize = 0, i;
+ int lock_tlbie;
+ struct hash_pte *hptep;
+ int actual_psize = MMU_PAGE_16M;
+ unsigned int max_hpte_count, valid;
+ unsigned long flags, s_addr = addr;
+ unsigned long hpte_v, want_v, shift;
+ unsigned long hidx, vpn = 0, vsid, hash, slot;
+
+ shift = mmu_psize_defs[psize].shift;
+ max_hpte_count = 1U << (PMD_SHIFT - shift);
+
+ local_irq_save(flags);
+ for (i = 0; i < max_hpte_count; i++) {
+ valid = hpte_valid(hpte_slot_array, i);
+ if (!valid)
+ continue;
+ hidx = hpte_hash_index(hpte_slot_array, i);
+
+ /* get the vpn */
+ addr = s_addr + (i * (1ul << shift));
+ if (!is_kernel_addr(addr)) {
+ ssize = user_segment_size(addr);
+ vsid = get_vsid(mm->context.id, addr, ssize);
+ WARN_ON(vsid == 0);
+ } else {
+ vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+ ssize = mmu_kernel_ssize;
+ }
+
+ vpn = hpt_vpn(addr, vsid, ssize);
+ hash = hpt_hash(vpn, shift, ssize);
+ if (hidx & _PTEIDX_SECONDARY)
+ hash = ~hash;
+
+ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot += hidx & _PTEIDX_GROUP_IX;
+
+ hptep = htab_address + slot;
+ want_v = hpte_encode_avpn(vpn, psize, ssize);
+ native_lock_hpte(hptep);
+ hpte_v = be64_to_cpu(hptep->v);
+
+ /* Even if we miss, we need to invalidate the TLB */
+ if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
+ native_unlock_hpte(hptep);
+ else
+ /* Invalidate the hpte. NOTE: this also unlocks it */
+ hptep->v = 0;
+ }
+ /*
+ * Since this is a hugepage, we just need a single tlbie.
+ * use the last vpn.
+ */
+ lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
+ if (lock_tlbie)
+ raw_spin_lock(&native_tlbie_lock);
+
+ asm volatile("ptesync":::"memory");
+ __tlbie(vpn, psize, actual_psize, ssize);
+ asm volatile("eieio; tlbsync; ptesync":::"memory");
+
+ if (lock_tlbie)
+ raw_spin_unlock(&native_tlbie_lock);
local_irq_restore(flags);
}
-#define LP_SHIFT 12
-#define LP_BITS 8
-#define LP_MASK(i) ((0xFF >> (i)) << LP_SHIFT)
+static inline int __hpte_actual_psize(unsigned int lp, int psize)
+{
+ int i, shift;
+ unsigned int mask;
+
+ /* start from 1 ignoring MMU_PAGE_4K */
+ for (i = 1; i < MMU_PAGE_COUNT; i++) {
+
+ /* invalid penc */
+ if (mmu_psize_defs[psize].penc[i] == -1)
+ continue;
+ /*
+ * encoding bits per actual page size
+ * PTE LP actual page size
+ * rrrr rrrz >=8KB
+ * rrrr rrzz >=16KB
+ * rrrr rzzz >=32KB
+ * rrrr zzzz >=64KB
+ * .......
+ */
+ shift = mmu_psize_defs[i].shift - LP_SHIFT;
+ if (shift > LP_BITS)
+ shift = LP_BITS;
+ mask = (1 << shift) - 1;
+ if ((lp & mask) == mmu_psize_defs[psize].penc[i])
+ return i;
+ }
+ return -1;
+}
static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
- int *psize, int *ssize, unsigned long *va)
+ int *psize, int *apsize, int *ssize, unsigned long *vpn)
{
- unsigned long hpte_r = hpte->r;
- unsigned long hpte_v = hpte->v;
- unsigned long avpn;
- int i, size, shift, penc;
-
- if (!(hpte_v & HPTE_V_LARGE))
- size = MMU_PAGE_4K;
- else {
- for (i = 0; i < LP_BITS; i++) {
- if ((hpte_r & LP_MASK(i+1)) == LP_MASK(i+1))
- break;
- }
- penc = LP_MASK(i+1) >> LP_SHIFT;
+ unsigned long avpn, pteg, vpi;
+ unsigned long hpte_v = be64_to_cpu(hpte->v);
+ unsigned long hpte_r = be64_to_cpu(hpte->r);
+ unsigned long vsid, seg_off;
+ int size, a_size, shift;
+ /* Look at the 8 bit LP value */
+ unsigned int lp = (hpte_r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
+
+ if (!(hpte_v & HPTE_V_LARGE)) {
+ size = MMU_PAGE_4K;
+ a_size = MMU_PAGE_4K;
+ } else {
for (size = 0; size < MMU_PAGE_COUNT; size++) {
- /* 4K pages are not represented by LP */
- if (size == MMU_PAGE_4K)
- continue;
-
/* valid entries have a shift value */
if (!mmu_psize_defs[size].shift)
continue;
- if (penc == mmu_psize_defs[size].penc)
+ a_size = __hpte_actual_psize(lp, size);
+ if (a_size != -1)
break;
}
}
-
/* This works for all page sizes, and for 256M and 1T segments */
+ *ssize = hpte_v >> HPTE_V_SSIZE_SHIFT;
shift = mmu_psize_defs[size].shift;
- avpn = (HPTE_V_AVPN_VAL(hpte_v) & ~mmu_psize_defs[size].avpnm) << 23;
-
- if (shift < 23) {
- unsigned long vpi, vsid, pteg;
- pteg = slot / HPTES_PER_GROUP;
- if (hpte_v & HPTE_V_SECONDARY)
- pteg = ~pteg;
- switch (hpte_v >> HPTE_V_SSIZE_SHIFT) {
- case MMU_SEGSIZE_256M:
- vpi = ((avpn >> 28) ^ pteg) & htab_hash_mask;
- break;
- case MMU_SEGSIZE_1T:
- vsid = avpn >> 40;
+ avpn = (HPTE_V_AVPN_VAL(hpte_v) & ~mmu_psize_defs[size].avpnm);
+ pteg = slot / HPTES_PER_GROUP;
+ if (hpte_v & HPTE_V_SECONDARY)
+ pteg = ~pteg;
+
+ switch (*ssize) {
+ case MMU_SEGSIZE_256M:
+ /* We only have 28 - 23 bits of seg_off in avpn */
+ seg_off = (avpn & 0x1f) << 23;
+ vsid = avpn >> 5;
+ /* We can find more bits from the pteg value */
+ if (shift < 23) {
+ vpi = (vsid ^ pteg) & htab_hash_mask;
+ seg_off |= vpi << shift;
+ }
+ *vpn = vsid << (SID_SHIFT - VPN_SHIFT) | seg_off >> VPN_SHIFT;
+ break;
+ case MMU_SEGSIZE_1T:
+ /* We only have 40 - 23 bits of seg_off in avpn */
+ seg_off = (avpn & 0x1ffff) << 23;
+ vsid = avpn >> 17;
+ if (shift < 23) {
vpi = (vsid ^ (vsid << 25) ^ pteg) & htab_hash_mask;
- break;
- default:
- avpn = vpi = size = 0;
+ seg_off |= vpi << shift;
}
- avpn |= (vpi << mmu_psize_defs[size].shift);
+ *vpn = vsid << (SID_SHIFT_1T - VPN_SHIFT) | seg_off >> VPN_SHIFT;
+ break;
+ default:
+ *vpn = size = 0;
}
-
- *va = avpn;
- *psize = size;
- *ssize = hpte_v >> HPTE_V_SSIZE_SHIFT;
+ *psize = size;
+ *apsize = a_size;
}
/*
@@ -418,11 +588,12 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
*/
static void native_hpte_clear(void)
{
+ unsigned long vpn = 0;
unsigned long slot, slots, flags;
struct hash_pte *hptep = htab_address;
- unsigned long hpte_v, va;
+ unsigned long hpte_v;
unsigned long pteg_count;
- int psize, ssize;
+ int psize, apsize, ssize;
pteg_count = htab_hash_mask + 1;
@@ -441,16 +612,16 @@ static void native_hpte_clear(void)
* running, right? and for crash dump, we probably
* don't want to wait for a maybe bad cpu.
*/
- hpte_v = hptep->v;
+ hpte_v = be64_to_cpu(hptep->v);
/*
* Call __tlbie() here rather than tlbie() since we
* already hold the native_tlbie_lock.
*/
if (hpte_v & HPTE_V_VALID) {
- hpte_decode(hptep, slot, &psize, &ssize, &va);
+ hpte_decode(hptep, slot, &psize, &apsize, &ssize, &vpn);
hptep->v = 0;
- __tlbie(va, psize, ssize);
+ __tlbie(vpn, psize, apsize, ssize);
}
}
@@ -465,7 +636,8 @@ static void native_hpte_clear(void)
*/
static void native_flush_hash_range(unsigned long number, int local)
{
- unsigned long va, hash, index, hidx, shift, slot;
+ unsigned long vpn;
+ unsigned long hash, index, hidx, shift, slot;
struct hash_pte *hptep;
unsigned long hpte_v;
unsigned long want_v;
@@ -479,20 +651,20 @@ static void native_flush_hash_range(unsigned long number, int local)
local_irq_save(flags);
for (i = 0; i < number; i++) {
- va = batch->vaddr[i];
+ vpn = batch->vpn[i];
pte = batch->pte[i];
- pte_iterate_hashed_subpages(pte, psize, va, index, shift) {
- hash = hpt_hash(va, shift, ssize);
+ pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
+ hash = hpt_hash(vpn, shift, ssize);
hidx = __rpte_to_hidx(pte, index);
if (hidx & _PTEIDX_SECONDARY)
hash = ~hash;
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
slot += hidx & _PTEIDX_GROUP_IX;
hptep = htab_address + slot;
- want_v = hpte_encode_v(va, psize, ssize);
+ want_v = hpte_encode_avpn(vpn, psize, ssize);
native_lock_hpte(hptep);
- hpte_v = hptep->v;
+ hpte_v = be64_to_cpu(hptep->v);
if (!HPTE_V_COMPARE(hpte_v, want_v) ||
!(hpte_v & HPTE_V_VALID))
native_unlock_hpte(hptep);
@@ -505,12 +677,12 @@ static void native_flush_hash_range(unsigned long number, int local)
mmu_psize_defs[psize].tlbiel && local) {
asm volatile("ptesync":::"memory");
for (i = 0; i < number; i++) {
- va = batch->vaddr[i];
+ vpn = batch->vpn[i];
pte = batch->pte[i];
- pte_iterate_hashed_subpages(pte, psize, va, index,
- shift) {
- __tlbiel(va, psize, ssize);
+ pte_iterate_hashed_subpages(pte, psize,
+ vpn, index, shift) {
+ __tlbiel(vpn, psize, psize, ssize);
} pte_iterate_hashed_end();
}
asm volatile("ptesync":::"memory");
@@ -522,12 +694,12 @@ static void native_flush_hash_range(unsigned long number, int local)
asm volatile("ptesync":::"memory");
for (i = 0; i < number; i++) {
- va = batch->vaddr[i];
+ vpn = batch->vpn[i];
pte = batch->pte[i];
- pte_iterate_hashed_subpages(pte, psize, va, index,
- shift) {
- __tlbie(va, psize, ssize);
+ pte_iterate_hashed_subpages(pte, psize,
+ vpn, index, shift) {
+ __tlbie(vpn, psize, psize, ssize);
} pte_iterate_hashed_end();
}
asm volatile("eieio; tlbsync; ptesync":::"memory");
@@ -539,29 +711,6 @@ static void native_flush_hash_range(unsigned long number, int local)
local_irq_restore(flags);
}
-#ifdef CONFIG_PPC_PSERIES
-/* Disable TLB batching on nighthawk */
-static inline int tlb_batching_enabled(void)
-{
- struct device_node *root = of_find_node_by_path("/");
- int enabled = 1;
-
- if (root) {
- const char *model = of_get_property(root, "model", NULL);
- if (model && !strcmp(model, "IBM,9076-N81"))
- enabled = 0;
- of_node_put(root);
- }
-
- return enabled;
-}
-#else
-static inline int tlb_batching_enabled(void)
-{
- return 1;
-}
-#endif
-
void __init hpte_init_native(void)
{
ppc_md.hpte_invalidate = native_hpte_invalidate;
@@ -570,6 +719,6 @@ void __init hpte_init_native(void)
ppc_md.hpte_insert = native_hpte_insert;
ppc_md.hpte_remove = native_hpte_remove;
ppc_md.hpte_clear_all = native_hpte_clear;
- if (tlb_batching_enabled())
- ppc_md.flush_hash_range = native_flush_hash_range;
+ ppc_md.flush_hash_range = native_flush_hash_range;
+ ppc_md.hugepage_invalidate = native_hugepage_invalidate;
}
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 2d282186cb4..88fdd9d2507 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -33,6 +33,7 @@
#include <linux/init.h>
#include <linux/signal.h>
#include <linux/memblock.h>
+#include <linux/context_tracking.h>
#include <asm/processor.h>
#include <asm/pgtable.h>
@@ -40,11 +41,9 @@
#include <asm/mmu_context.h>
#include <asm/page.h>
#include <asm/types.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/machdep.h>
#include <asm/prom.h>
-#include <asm/abs_addr.h>
#include <asm/tlbflush.h>
#include <asm/io.h>
#include <asm/eeh.h>
@@ -55,6 +54,9 @@
#include <asm/spu.h>
#include <asm/udbg.h>
#include <asm/code-patching.h>
+#include <asm/fadump.h>
+#include <asm/firmware.h>
+#include <asm/tm.h>
#ifdef DEBUG
#define DBG(fmt...) udbg_printf(fmt)
@@ -125,7 +127,7 @@ static struct mmu_psize_def mmu_psize_defaults_old[] = {
[MMU_PAGE_4K] = {
.shift = 12,
.sllp = 0,
- .penc = 0,
+ .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1},
.avpnm = 0,
.tlbiel = 0,
},
@@ -139,14 +141,15 @@ static struct mmu_psize_def mmu_psize_defaults_gp[] = {
[MMU_PAGE_4K] = {
.shift = 12,
.sllp = 0,
- .penc = 0,
+ .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1},
.avpnm = 0,
.tlbiel = 1,
},
[MMU_PAGE_16M] = {
.shift = 24,
.sllp = SLB_VSID_L,
- .penc = 0,
+ .penc = {[0 ... MMU_PAGE_16M - 1] = -1, [MMU_PAGE_16M] = 0,
+ [MMU_PAGE_16M + 1 ... MMU_PAGE_COUNT - 1] = -1 },
.avpnm = 0x1UL,
.tlbiel = 0,
},
@@ -166,9 +169,10 @@ static unsigned long htab_convert_pte_flags(unsigned long pteflags)
if ((pteflags & _PAGE_USER) && !((pteflags & _PAGE_RW) &&
(pteflags & _PAGE_DIRTY)))
rflags |= 1;
-
- /* Always add C */
- return rflags | HPTE_R_C;
+ /*
+ * Always add "C" bit for perf. Memory coherence is always enabled
+ */
+ return rflags | HPTE_R_C | HPTE_R_M;
}
int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
@@ -191,19 +195,42 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
vaddr += step, paddr += step) {
unsigned long hash, hpteg;
unsigned long vsid = get_kernel_vsid(vaddr, ssize);
- unsigned long va = hpt_va(vaddr, vsid, ssize);
+ unsigned long vpn = hpt_vpn(vaddr, vsid, ssize);
unsigned long tprot = prot;
+ /*
+ * If we hit a bad address return error.
+ */
+ if (!vsid)
+ return -1;
/* Make kernel text executable */
if (overlaps_kernel_text(vaddr, vaddr + step))
tprot &= ~HPTE_R_N;
- hash = hpt_hash(va, shift, ssize);
+ /* Make kvm guest trampolines executable */
+ if (overlaps_kvm_tmp(vaddr, vaddr + step))
+ tprot &= ~HPTE_R_N;
+
+ /*
+ * If relocatable, check if it overlaps interrupt vectors that
+ * are copied down to real 0. For relocatable kernel
+ * (e.g. kdump case) we copy interrupt vectors down to real
+ * address 0. Mark that region as executable. This is
+ * because on p8 system with relocation on exception feature
+ * enabled, exceptions are raised with MMU (IR=DR=1) ON. Hence
+ * in order to execute the interrupt handlers in virtual
+ * mode the vector region need to be marked as executable.
+ */
+ if ((PHYSICAL_START > MEMORY_START) &&
+ overlaps_interrupt_vector_text(vaddr, vaddr + step))
+ tprot &= ~HPTE_R_N;
+
+ hash = hpt_hash(vpn, shift, ssize);
hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
BUG_ON(!ppc_md.hpte_insert);
- ret = ppc_md.hpte_insert(hpteg, va, paddr, tprot,
- HPTE_V_BOLTED, psize, ssize);
+ ret = ppc_md.hpte_insert(hpteg, vpn, paddr, tprot,
+ HPTE_V_BOLTED, psize, psize, ssize);
if (ret < 0)
break;
@@ -242,20 +269,19 @@ static int __init htab_dt_scan_seg_sizes(unsigned long node,
const char *uname, int depth,
void *data)
{
- char *type = of_get_flat_dt_prop(node, "device_type", NULL);
- u32 *prop;
- unsigned long size = 0;
+ const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+ const __be32 *prop;
+ int size = 0;
/* We are scanning "cpu" nodes only */
if (type == NULL || strcmp(type, "cpu") != 0)
return 0;
- prop = (u32 *)of_get_flat_dt_prop(node, "ibm,processor-segment-sizes",
- &size);
+ prop = of_get_flat_dt_prop(node, "ibm,processor-segment-sizes", &size);
if (prop == NULL)
return 0;
for (; size >= 4; size -= 4, ++prop) {
- if (prop[0] == 40) {
+ if (be32_to_cpu(prop[0]) == 40) {
DBG("1T segment support detected\n");
cur_cpu_spec->mmu_features |= MMU_FTR_1T_SEGMENT;
return 1;
@@ -270,79 +296,103 @@ static void __init htab_init_seg_sizes(void)
of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL);
}
+static int __init get_idx_from_shift(unsigned int shift)
+{
+ int idx = -1;
+
+ switch (shift) {
+ case 0xc:
+ idx = MMU_PAGE_4K;
+ break;
+ case 0x10:
+ idx = MMU_PAGE_64K;
+ break;
+ case 0x14:
+ idx = MMU_PAGE_1M;
+ break;
+ case 0x18:
+ idx = MMU_PAGE_16M;
+ break;
+ case 0x22:
+ idx = MMU_PAGE_16G;
+ break;
+ }
+ return idx;
+}
+
static int __init htab_dt_scan_page_sizes(unsigned long node,
const char *uname, int depth,
void *data)
{
- char *type = of_get_flat_dt_prop(node, "device_type", NULL);
- u32 *prop;
- unsigned long size = 0;
+ const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+ const __be32 *prop;
+ int size = 0;
/* We are scanning "cpu" nodes only */
if (type == NULL || strcmp(type, "cpu") != 0)
return 0;
- prop = (u32 *)of_get_flat_dt_prop(node,
- "ibm,segment-page-sizes", &size);
+ prop = of_get_flat_dt_prop(node, "ibm,segment-page-sizes", &size);
if (prop != NULL) {
- DBG("Page sizes from device-tree:\n");
+ pr_info("Page sizes from device-tree:\n");
size /= 4;
cur_cpu_spec->mmu_features &= ~(MMU_FTR_16M_PAGE);
while(size > 0) {
- unsigned int shift = prop[0];
- unsigned int slbenc = prop[1];
- unsigned int lpnum = prop[2];
- unsigned int lpenc = 0;
+ unsigned int base_shift = be32_to_cpu(prop[0]);
+ unsigned int slbenc = be32_to_cpu(prop[1]);
+ unsigned int lpnum = be32_to_cpu(prop[2]);
struct mmu_psize_def *def;
- int idx = -1;
+ int idx, base_idx;
size -= 3; prop += 3;
- while(size > 0 && lpnum) {
- if (prop[0] == shift)
- lpenc = prop[1];
- prop += 2; size -= 2;
- lpnum--;
+ base_idx = get_idx_from_shift(base_shift);
+ if (base_idx < 0) {
+ /*
+ * skip the pte encoding also
+ */
+ prop += lpnum * 2; size -= lpnum * 2;
+ continue;
}
- switch(shift) {
- case 0xc:
- idx = MMU_PAGE_4K;
- break;
- case 0x10:
- idx = MMU_PAGE_64K;
- break;
- case 0x14:
- idx = MMU_PAGE_1M;
- break;
- case 0x18:
- idx = MMU_PAGE_16M;
+ def = &mmu_psize_defs[base_idx];
+ if (base_idx == MMU_PAGE_16M)
cur_cpu_spec->mmu_features |= MMU_FTR_16M_PAGE;
- break;
- case 0x22:
- idx = MMU_PAGE_16G;
- break;
- }
- if (idx < 0)
- continue;
- def = &mmu_psize_defs[idx];
- def->shift = shift;
- if (shift <= 23)
+
+ def->shift = base_shift;
+ if (base_shift <= 23)
def->avpnm = 0;
else
- def->avpnm = (1 << (shift - 23)) - 1;
+ def->avpnm = (1 << (base_shift - 23)) - 1;
def->sllp = slbenc;
- def->penc = lpenc;
- /* We don't know for sure what's up with tlbiel, so
+ /*
+ * We don't know for sure what's up with tlbiel, so
* for now we only set it for 4K and 64K pages
*/
- if (idx == MMU_PAGE_4K || idx == MMU_PAGE_64K)
+ if (base_idx == MMU_PAGE_4K || base_idx == MMU_PAGE_64K)
def->tlbiel = 1;
else
def->tlbiel = 0;
- DBG(" %d: shift=%02x, sllp=%04lx, avpnm=%08lx, "
- "tlbiel=%d, penc=%d\n",
- idx, shift, def->sllp, def->avpnm, def->tlbiel,
- def->penc);
+ while (size > 0 && lpnum) {
+ unsigned int shift = be32_to_cpu(prop[0]);
+ int penc = be32_to_cpu(prop[1]);
+
+ prop += 2; size -= 2;
+ lpnum--;
+
+ idx = get_idx_from_shift(shift);
+ if (idx < 0)
+ continue;
+
+ if (penc == -1)
+ pr_err("Invalid penc for base_shift=%d "
+ "shift=%d\n", base_shift, shift);
+
+ def->penc[idx] = penc;
+ pr_info("base_shift=%d: shift=%d, sllp=0x%04lx,"
+ " avpnm=0x%08lx, tlbiel=%d, penc=%d\n",
+ base_shift, shift, def->sllp,
+ def->avpnm, def->tlbiel, def->penc[idx]);
+ }
}
return 1;
}
@@ -356,9 +406,9 @@ static int __init htab_dt_scan_page_sizes(unsigned long node,
static int __init htab_dt_scan_hugepage_blocks(unsigned long node,
const char *uname, int depth,
void *data) {
- char *type = of_get_flat_dt_prop(node, "device_type", NULL);
- unsigned long *addr_prop;
- u32 *page_count_prop;
+ const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+ const __be64 *addr_prop;
+ const __be32 *page_count_prop;
unsigned int expected_pages;
long unsigned int phys_addr;
long unsigned int block_size;
@@ -372,12 +422,12 @@ static int __init htab_dt_scan_hugepage_blocks(unsigned long node,
page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL);
if (page_count_prop == NULL)
return 0;
- expected_pages = (1 << page_count_prop[0]);
+ expected_pages = (1 << be32_to_cpu(page_count_prop[0]));
addr_prop = of_get_flat_dt_prop(node, "reg", NULL);
if (addr_prop == NULL)
return 0;
- phys_addr = addr_prop[0];
- block_size = addr_prop[1];
+ phys_addr = be64_to_cpu(addr_prop[0]);
+ block_size = be64_to_cpu(addr_prop[1]);
if (block_size != (16 * GB))
return 0;
printk(KERN_INFO "Huge page(16GB) memory: "
@@ -391,10 +441,39 @@ static int __init htab_dt_scan_hugepage_blocks(unsigned long node,
}
#endif /* CONFIG_HUGETLB_PAGE */
+static void mmu_psize_set_default_penc(void)
+{
+ int bpsize, apsize;
+ for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++)
+ for (apsize = 0; apsize < MMU_PAGE_COUNT; apsize++)
+ mmu_psize_defs[bpsize].penc[apsize] = -1;
+}
+
+#ifdef CONFIG_PPC_64K_PAGES
+
+static bool might_have_hea(void)
+{
+ /*
+ * The HEA ethernet adapter requires awareness of the
+ * GX bus. Without that awareness we can easily assume
+ * we will never see an HEA ethernet device.
+ */
+#ifdef CONFIG_IBMEBUS
+ return !cpu_has_feature(CPU_FTR_ARCH_207S);
+#else
+ return false;
+#endif
+}
+
+#endif /* #ifdef CONFIG_PPC_64K_PAGES */
+
static void __init htab_init_page_sizes(void)
{
int rc;
+ /* se the invalid penc to -1 */
+ mmu_psize_set_default_penc();
+
/* Default to 4K pages only */
memcpy(mmu_psize_defs, mmu_psize_defaults_old,
sizeof(mmu_psize_defaults_old));
@@ -442,10 +521,11 @@ static void __init htab_init_page_sizes(void)
mmu_linear_psize = MMU_PAGE_64K;
if (mmu_has_feature(MMU_FTR_CI_LARGE_PAGE)) {
/*
- * Don't use 64k pages for ioremap on pSeries, since
- * that would stop us accessing the HEA ethernet.
+ * When running on pSeries using 64k pages for ioremap
+ * would stop us accessing the HEA ethernet. So if we
+ * have the chance of ever seeing one, stay at 4k.
*/
- if (!machine_is(pseries))
+ if (!might_have_hea() || !machine_is(pseries))
mmu_io_psize = MMU_PAGE_64K;
} else
mmu_ci_restrictions = 1;
@@ -489,17 +569,17 @@ static int __init htab_dt_scan_pftsize(unsigned long node,
const char *uname, int depth,
void *data)
{
- char *type = of_get_flat_dt_prop(node, "device_type", NULL);
- u32 *prop;
+ const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+ const __be32 *prop;
/* We are scanning "cpu" nodes only */
if (type == NULL || strcmp(type, "cpu") != 0)
return 0;
- prop = (u32 *)of_get_flat_dt_prop(node, "ibm,pft-size", NULL);
+ prop = of_get_flat_dt_prop(node, "ibm,pft-size", NULL);
if (prop != NULL) {
/* pft_size[0] is the NUMA CEC cookie */
- ppc64_pft_size = prop[1];
+ ppc64_pft_size = be32_to_cpu(prop[1]);
return 1;
}
return 0;
@@ -546,47 +626,43 @@ int remove_section_mapping(unsigned long start, unsigned long end)
}
#endif /* CONFIG_MEMORY_HOTPLUG */
-#define FUNCTION_TEXT(A) ((*(unsigned long *)(A)))
+extern u32 htab_call_hpte_insert1[];
+extern u32 htab_call_hpte_insert2[];
+extern u32 htab_call_hpte_remove[];
+extern u32 htab_call_hpte_updatepp[];
+extern u32 ht64_call_hpte_insert1[];
+extern u32 ht64_call_hpte_insert2[];
+extern u32 ht64_call_hpte_remove[];
+extern u32 ht64_call_hpte_updatepp[];
static void __init htab_finish_init(void)
{
- extern unsigned int *htab_call_hpte_insert1;
- extern unsigned int *htab_call_hpte_insert2;
- extern unsigned int *htab_call_hpte_remove;
- extern unsigned int *htab_call_hpte_updatepp;
-
#ifdef CONFIG_PPC_HAS_HASH_64K
- extern unsigned int *ht64_call_hpte_insert1;
- extern unsigned int *ht64_call_hpte_insert2;
- extern unsigned int *ht64_call_hpte_remove;
- extern unsigned int *ht64_call_hpte_updatepp;
-
patch_branch(ht64_call_hpte_insert1,
- FUNCTION_TEXT(ppc_md.hpte_insert),
+ ppc_function_entry(ppc_md.hpte_insert),
BRANCH_SET_LINK);
patch_branch(ht64_call_hpte_insert2,
- FUNCTION_TEXT(ppc_md.hpte_insert),
+ ppc_function_entry(ppc_md.hpte_insert),
BRANCH_SET_LINK);
patch_branch(ht64_call_hpte_remove,
- FUNCTION_TEXT(ppc_md.hpte_remove),
+ ppc_function_entry(ppc_md.hpte_remove),
BRANCH_SET_LINK);
patch_branch(ht64_call_hpte_updatepp,
- FUNCTION_TEXT(ppc_md.hpte_updatepp),
+ ppc_function_entry(ppc_md.hpte_updatepp),
BRANCH_SET_LINK);
-
#endif /* CONFIG_PPC_HAS_HASH_64K */
patch_branch(htab_call_hpte_insert1,
- FUNCTION_TEXT(ppc_md.hpte_insert),
+ ppc_function_entry(ppc_md.hpte_insert),
BRANCH_SET_LINK);
patch_branch(htab_call_hpte_insert2,
- FUNCTION_TEXT(ppc_md.hpte_insert),
+ ppc_function_entry(ppc_md.hpte_insert),
BRANCH_SET_LINK);
patch_branch(htab_call_hpte_remove,
- FUNCTION_TEXT(ppc_md.hpte_remove),
+ ppc_function_entry(ppc_md.hpte_remove),
BRANCH_SET_LINK);
patch_branch(htab_call_hpte_updatepp,
- FUNCTION_TEXT(ppc_md.hpte_updatepp),
+ ppc_function_entry(ppc_md.hpte_updatepp),
BRANCH_SET_LINK);
}
@@ -625,6 +701,16 @@ static void __init htab_initialize(void)
/* Using a hypervisor which owns the htab */
htab_address = NULL;
_SDR1 = 0;
+#ifdef CONFIG_FA_DUMP
+ /*
+ * If firmware assisted dump is active firmware preserves
+ * the contents of htab along with entire partition memory.
+ * Clear the htab if firmware assisted dump is active so
+ * that we dont end up using old mappings.
+ */
+ if (is_fadump_active() && ppc_md.hpte_clear_all)
+ ppc_md.hpte_clear_all();
+#endif
} else {
/* Find storage for the HPT. Must be contiguous in
* the absolute address space. On cell we want it to be
@@ -640,7 +726,7 @@ static void __init htab_initialize(void)
DBG("Hash table allocated at %lx, size: %lx\n", table,
htab_size_bytes);
- htab_address = abs_to_virt(table);
+ htab_address = __va(table);
/* htab absolute addr + encoded htabsize */
_SDR1 = table + __ilog2(pteg_count) - 11;
@@ -745,24 +831,22 @@ void __init early_init_mmu(void)
*/
htab_initialize();
- /* Initialize stab / SLB management except on iSeries
- */
+ /* Initialize stab / SLB management */
if (mmu_has_feature(MMU_FTR_SLB))
slb_initialize();
- else if (!firmware_has_feature(FW_FEATURE_ISERIES))
+ else
stab_initialize(get_paca()->stab_real);
}
#ifdef CONFIG_SMP
-void __cpuinit early_init_mmu_secondary(void)
+void early_init_mmu_secondary(void)
{
/* Initialize hash table for that CPU */
if (!firmware_has_feature(FW_FEATURE_LPAR))
mtspr(SPRN_SDR1, _SDR1);
/* Initialize STAB/SLB. We use a virtual address as it works
- * in real mode on pSeries and we want a virtual address on
- * iSeries anyway
+ * in real mode on pSeries.
*/
if (mmu_has_feature(MMU_FTR_SLB))
slb_initialize();
@@ -797,16 +881,19 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
#ifdef CONFIG_PPC_MM_SLICES
unsigned int get_paca_psize(unsigned long addr)
{
- unsigned long index, slices;
+ u64 lpsizes;
+ unsigned char *hpsizes;
+ unsigned long index, mask_index;
if (addr < SLICE_LOW_TOP) {
- slices = get_paca()->context.low_slices_psize;
+ lpsizes = get_paca()->context.low_slices_psize;
index = GET_LOW_SLICE_INDEX(addr);
- } else {
- slices = get_paca()->context.high_slices_psize;
- index = GET_HIGH_SLICE_INDEX(addr);
+ return (lpsizes >> (index * 4)) & 0xF;
}
- return (slices >> (index * 4)) & 0xF;
+ hpsizes = get_paca()->context.high_slices_psize;
+ index = GET_HIGH_SLICE_INDEX(addr);
+ mask_index = index & 0x1;
+ return (hpsizes[index >> 1] >> (mask_index * 4)) & 0xF;
}
#else
@@ -852,7 +939,7 @@ static int subpage_protection(struct mm_struct *mm, unsigned long ea)
if (ea >= spt->maxaddr)
return 0;
- if (ea < 0x100000000) {
+ if (ea < 0x100000000UL) {
/* addresses below 4GB use spt->low_prot */
sbpm = spt->low_prot;
} else {
@@ -882,14 +969,30 @@ static inline int subpage_protection(struct mm_struct *mm, unsigned long ea)
void hash_failure_debug(unsigned long ea, unsigned long access,
unsigned long vsid, unsigned long trap,
- int ssize, int psize, unsigned long pte)
+ int ssize, int psize, int lpsize, unsigned long pte)
{
if (!printk_ratelimit())
return;
pr_info("mm: Hashing failure ! EA=0x%lx access=0x%lx current=%s\n",
ea, access, current->comm);
- pr_info(" trap=0x%lx vsid=0x%lx ssize=%d psize=%d pte=0x%lx\n",
- trap, vsid, ssize, psize, pte);
+ pr_info(" trap=0x%lx vsid=0x%lx ssize=%d base psize=%d psize %d pte=0x%lx\n",
+ trap, vsid, ssize, psize, lpsize, pte);
+}
+
+static void check_paca_psize(unsigned long ea, struct mm_struct *mm,
+ int psize, bool user_region)
+{
+ if (user_region) {
+ if (psize != get_paca_psize(ea)) {
+ get_paca()->context = mm->context;
+ slb_flush_and_rebolt();
+ }
+ } else if (get_paca()->vmalloc_sllp !=
+ mmu_psize_defs[mmu_vmalloc_psize].sllp) {
+ get_paca()->vmalloc_sllp =
+ mmu_psize_defs[mmu_vmalloc_psize].sllp;
+ slb_vmalloc_update();
+ }
}
/* Result code is:
@@ -900,6 +1003,7 @@ void hash_failure_debug(unsigned long ea, unsigned long access,
*/
int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
{
+ enum ctx_state prev_state = exception_enter();
pgd_t *pgdir;
unsigned long vsid;
struct mm_struct *mm;
@@ -912,11 +1016,6 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n",
ea, access, trap);
- if ((ea & ~REGION_MASK) >= PGTABLE_RANGE) {
- DBG_LOW(" out of pgtable range !\n");
- return 1;
- }
-
/* Get region & vsid */
switch (REGION_ID(ea)) {
case USER_REGION_ID:
@@ -924,7 +1023,8 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
mm = current->mm;
if (! mm) {
DBG_LOW(" user region with no mm !\n");
- return 1;
+ rc = 1;
+ goto bail;
}
psize = get_slice_psize(mm, ea);
ssize = user_segment_size(ea);
@@ -943,14 +1043,23 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
/* Not a valid range
* Send the problem up to do_page_fault
*/
- return 1;
+ rc = 1;
+ goto bail;
}
DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid);
+ /* Bad address. */
+ if (!vsid) {
+ DBG_LOW("Bad address!\n");
+ rc = 1;
+ goto bail;
+ }
/* Get pgdir */
pgdir = mm->pgd;
- if (pgdir == NULL)
- return 1;
+ if (pgdir == NULL) {
+ rc = 1;
+ goto bail;
+ }
/* Check CPU locality */
tmp = cpumask_of(smp_processor_id());
@@ -973,7 +1082,8 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugeshift);
if (ptep == NULL || !pte_present(*ptep)) {
DBG_LOW(" no PTE !\n");
- return 1;
+ rc = 1;
+ goto bail;
}
/* Add _PAGE_PRESENT to the required access perm */
@@ -984,14 +1094,32 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
*/
if (access & ~pte_val(*ptep)) {
DBG_LOW(" no access !\n");
- return 1;
+ rc = 1;
+ goto bail;
}
+ if (hugeshift) {
+ if (pmd_trans_huge(*(pmd_t *)ptep))
+ rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep,
+ trap, local, ssize, psize);
#ifdef CONFIG_HUGETLB_PAGE
- if (hugeshift)
- return __hash_page_huge(ea, access, vsid, ptep, trap, local,
- ssize, hugeshift, psize);
-#endif /* CONFIG_HUGETLB_PAGE */
+ else
+ rc = __hash_page_huge(ea, access, vsid, ptep, trap,
+ local, ssize, hugeshift, psize);
+#else
+ else {
+ /*
+ * if we have hugeshift, and is not transhuge with
+ * hugetlb disabled, something is really wrong.
+ */
+ rc = 1;
+ WARN_ON(1);
+ }
+#endif
+ check_paca_psize(ea, mm, psize, user_region);
+
+ goto bail;
+ }
#ifndef CONFIG_PPC_64K_PAGES
DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
@@ -1030,17 +1158,8 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
#endif
}
}
- if (user_region) {
- if (psize != get_paca_psize(ea)) {
- get_paca()->context = mm->context;
- slb_flush_and_rebolt();
- }
- } else if (get_paca()->vmalloc_sllp !=
- mmu_psize_defs[mmu_vmalloc_psize].sllp) {
- get_paca()->vmalloc_sllp =
- mmu_psize_defs[mmu_vmalloc_psize].sllp;
- slb_vmalloc_update();
- }
+
+ check_paca_psize(ea, mm, psize, user_region);
#endif /* CONFIG_PPC_64K_PAGES */
#ifdef CONFIG_PPC_HAS_HASH_64K
@@ -1062,7 +1181,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
*/
if (rc == -1)
hash_failure_debug(ea, access, vsid, trap, ssize, psize,
- pte_val(*ptep));
+ psize, pte_val(*ptep));
#ifndef CONFIG_PPC_64K_PAGES
DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep));
#else
@@ -1070,6 +1189,9 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
pte_val(*(ptep + PTRS_PER_PTE)));
#endif
DBG_LOW(" -> rc=%d\n", rc);
+
+bail:
+ exception_exit(prev_state);
return rc;
}
EXPORT_SYMBOL_GPL(hash_page);
@@ -1077,6 +1199,7 @@ EXPORT_SYMBOL_GPL(hash_page);
void hash_preload(struct mm_struct *mm, unsigned long ea,
unsigned long access, unsigned long trap)
{
+ int hugepage_shift;
unsigned long vsid;
pgd_t *pgdir;
pte_t *ptep;
@@ -1098,10 +1221,27 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
pgdir = mm->pgd;
if (pgdir == NULL)
return;
- ptep = find_linux_pte(pgdir, ea);
- if (!ptep)
+
+ /* Get VSID */
+ ssize = user_segment_size(ea);
+ vsid = get_vsid(mm->context.id, ea, ssize);
+ if (!vsid)
return;
+ /*
+ * Hash doesn't like irqs. Walking linux page table with irq disabled
+ * saves us from holding multiple locks.
+ */
+ local_irq_save(flags);
+
+ /*
+ * THP pages use update_mmu_cache_pmd. We don't do
+ * hash preload there. Hence can ignore THP here
+ */
+ ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugepage_shift);
+ if (!ptep)
+ goto out_exit;
+ WARN_ON(hugepage_shift);
#ifdef CONFIG_PPC_64K_PAGES
/* If either _PAGE_4K_PFN or _PAGE_NO_CACHE is set (and we are on
* a 64K kernel), then we don't preload, hash_page() will take
@@ -1110,16 +1250,9 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
* page size demotion here
*/
if (pte_val(*ptep) & (_PAGE_4K_PFN | _PAGE_NO_CACHE))
- return;
+ goto out_exit;
#endif /* CONFIG_PPC_64K_PAGES */
- /* Get VSID */
- ssize = user_segment_size(ea);
- vsid = get_vsid(mm->context.id, ea, ssize);
-
- /* Hash doesn't like irqs */
- local_irq_save(flags);
-
/* Is that local to this CPU ? */
if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
local = 1;
@@ -1138,30 +1271,52 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
*/
if (rc == -1)
hash_failure_debug(ea, access, vsid, trap, ssize,
- mm->context.user_psize, pte_val(*ptep));
-
+ mm->context.user_psize,
+ mm->context.user_psize,
+ pte_val(*ptep));
+out_exit:
local_irq_restore(flags);
}
/* WARNING: This is called from hash_low_64.S, if you change this prototype,
* do not forget to update the assembly call site !
*/
-void flush_hash_page(unsigned long va, real_pte_t pte, int psize, int ssize,
+void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize,
int local)
{
unsigned long hash, index, shift, hidx, slot;
- DBG_LOW("flush_hash_page(va=%016lx)\n", va);
- pte_iterate_hashed_subpages(pte, psize, va, index, shift) {
- hash = hpt_hash(va, shift, ssize);
+ DBG_LOW("flush_hash_page(vpn=%016lx)\n", vpn);
+ pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
+ hash = hpt_hash(vpn, shift, ssize);
hidx = __rpte_to_hidx(pte, index);
if (hidx & _PTEIDX_SECONDARY)
hash = ~hash;
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
slot += hidx & _PTEIDX_GROUP_IX;
DBG_LOW(" sub %ld: hash=%lx, hidx=%lx\n", index, slot, hidx);
- ppc_md.hpte_invalidate(slot, va, psize, ssize, local);
+ /*
+ * We use same base page size and actual psize, because we don't
+ * use these functions for hugepage
+ */
+ ppc_md.hpte_invalidate(slot, vpn, psize, psize, ssize, local);
} pte_iterate_hashed_end();
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ /* Transactions are not aborted by tlbiel, only tlbie.
+ * Without, syncing a page back to a block device w/ PIO could pick up
+ * transactional data (bad!) so we force an abort here. Before the
+ * sync the page will be made read-only, which will flush_hash_page.
+ * BIG ISSUE here: if the kernel uses a page from userspace without
+ * unmapping it first, it may see the speculated version.
+ */
+ if (local && cpu_has_feature(CPU_FTR_TM) &&
+ current->thread.regs &&
+ MSR_TM_ACTIVE(current->thread.regs->msr)) {
+ tm_enable();
+ tm_abort(TM_CAUSE_TLBI);
+ }
+#endif
}
void flush_hash_range(unsigned long number, int local)
@@ -1174,7 +1329,7 @@ void flush_hash_range(unsigned long number, int local)
&__get_cpu_var(ppc64_tlb_batch);
for (i = 0; i < number; i++)
- flush_hash_page(batch->vaddr[i], batch->pte[i],
+ flush_hash_page(batch->vpn[i], batch->pte[i],
batch->psize, batch->ssize, local);
}
}
@@ -1185,6 +1340,8 @@ void flush_hash_range(unsigned long number, int local)
*/
void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc)
{
+ enum ctx_state prev_state = exception_enter();
+
if (user_mode(regs)) {
#ifdef CONFIG_PPC_SUBPAGE_PROT
if (rc == -2)
@@ -1194,23 +1351,64 @@ void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc)
_exception(SIGBUS, regs, BUS_ADRERR, address);
} else
bad_page_fault(regs, address, SIGBUS);
+
+ exception_exit(prev_state);
+}
+
+long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
+ unsigned long pa, unsigned long rflags,
+ unsigned long vflags, int psize, int ssize)
+{
+ unsigned long hpte_group;
+ long slot;
+
+repeat:
+ hpte_group = ((hash & htab_hash_mask) *
+ HPTES_PER_GROUP) & ~0x7UL;
+
+ /* Insert into the hash table, primary slot */
+ slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, vflags,
+ psize, psize, ssize);
+
+ /* Primary is full, try the secondary */
+ if (unlikely(slot == -1)) {
+ hpte_group = ((~hash & htab_hash_mask) *
+ HPTES_PER_GROUP) & ~0x7UL;
+ slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags,
+ vflags | HPTE_V_SECONDARY,
+ psize, psize, ssize);
+ if (slot == -1) {
+ if (mftb() & 0x1)
+ hpte_group = ((hash & htab_hash_mask) *
+ HPTES_PER_GROUP)&~0x7UL;
+
+ ppc_md.hpte_remove(hpte_group);
+ goto repeat;
+ }
+ }
+
+ return slot;
}
#ifdef CONFIG_DEBUG_PAGEALLOC
static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
{
- unsigned long hash, hpteg;
+ unsigned long hash;
unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
- unsigned long va = hpt_va(vaddr, vsid, mmu_kernel_ssize);
+ unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
unsigned long mode = htab_convert_pte_flags(PAGE_KERNEL);
- int ret;
+ long ret;
+
+ hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
+
+ /* Don't create HPTE entries for bad address */
+ if (!vsid)
+ return;
- hash = hpt_hash(va, PAGE_SHIFT, mmu_kernel_ssize);
- hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
+ ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode,
+ HPTE_V_BOLTED,
+ mmu_linear_psize, mmu_kernel_ssize);
- ret = ppc_md.hpte_insert(hpteg, va, __pa(vaddr),
- mode, HPTE_V_BOLTED,
- mmu_linear_psize, mmu_kernel_ssize);
BUG_ON (ret < 0);
spin_lock(&linear_map_hash_lock);
BUG_ON(linear_map_hash_slots[lmi] & 0x80);
@@ -1222,9 +1420,9 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
{
unsigned long hash, hidx, slot;
unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
- unsigned long va = hpt_va(vaddr, vsid, mmu_kernel_ssize);
+ unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
- hash = hpt_hash(va, PAGE_SHIFT, mmu_kernel_ssize);
+ hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
spin_lock(&linear_map_hash_lock);
BUG_ON(!(linear_map_hash_slots[lmi] & 0x80));
hidx = linear_map_hash_slots[lmi] & 0x7f;
@@ -1234,7 +1432,8 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
hash = ~hash;
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
slot += hidx & _PTEIDX_GROUP_IX;
- ppc_md.hpte_invalidate(slot, va, mmu_linear_psize, mmu_kernel_ssize, 0);
+ ppc_md.hpte_invalidate(slot, vpn, mmu_linear_psize, mmu_linear_psize,
+ mmu_kernel_ssize, 0);
}
void kernel_map_pages(struct page *page, int numpages, int enable)
diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c
new file mode 100644
index 00000000000..826893fcb3a
--- /dev/null
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -0,0 +1,179 @@
+/*
+ * Copyright IBM Corporation, 2013
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+/*
+ * PPC64 THP Support for hash based MMUs
+ */
+#include <linux/mm.h>
+#include <asm/machdep.h>
+
+int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
+ pmd_t *pmdp, unsigned long trap, int local, int ssize,
+ unsigned int psize)
+{
+ unsigned int index, valid;
+ unsigned char *hpte_slot_array;
+ unsigned long rflags, pa, hidx;
+ unsigned long old_pmd, new_pmd;
+ int ret, lpsize = MMU_PAGE_16M;
+ unsigned long vpn, hash, shift, slot;
+
+ /*
+ * atomically mark the linux large page PMD busy and dirty
+ */
+ do {
+ old_pmd = pmd_val(*pmdp);
+ /* If PMD busy, retry the access */
+ if (unlikely(old_pmd & _PAGE_BUSY))
+ return 0;
+ /* If PMD is trans splitting retry the access */
+ if (unlikely(old_pmd & _PAGE_SPLITTING))
+ return 0;
+ /* If PMD permissions don't match, take page fault */
+ if (unlikely(access & ~old_pmd))
+ return 1;
+ /*
+ * Try to lock the PTE, add ACCESSED and DIRTY if it was
+ * a write access
+ */
+ new_pmd = old_pmd | _PAGE_BUSY | _PAGE_ACCESSED;
+ if (access & _PAGE_RW)
+ new_pmd |= _PAGE_DIRTY;
+ } while (old_pmd != __cmpxchg_u64((unsigned long *)pmdp,
+ old_pmd, new_pmd));
+ /*
+ * PP bits. _PAGE_USER is already PP bit 0x2, so we only
+ * need to add in 0x1 if it's a read-only user page
+ */
+ rflags = new_pmd & _PAGE_USER;
+ if ((new_pmd & _PAGE_USER) && !((new_pmd & _PAGE_RW) &&
+ (new_pmd & _PAGE_DIRTY)))
+ rflags |= 0x1;
+ /*
+ * _PAGE_EXEC -> HW_NO_EXEC since it's inverted
+ */
+ rflags |= ((new_pmd & _PAGE_EXEC) ? 0 : HPTE_R_N);
+
+#if 0
+ if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
+
+ /*
+ * No CPU has hugepages but lacks no execute, so we
+ * don't need to worry about that case
+ */
+ rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
+ }
+#endif
+ /*
+ * Find the slot index details for this ea, using base page size.
+ */
+ shift = mmu_psize_defs[psize].shift;
+ index = (ea & ~HPAGE_PMD_MASK) >> shift;
+ BUG_ON(index >= 4096);
+
+ vpn = hpt_vpn(ea, vsid, ssize);
+ hash = hpt_hash(vpn, shift, ssize);
+ hpte_slot_array = get_hpte_slot_array(pmdp);
+
+ valid = hpte_valid(hpte_slot_array, index);
+ if (valid) {
+ /* update the hpte bits */
+ hidx = hpte_hash_index(hpte_slot_array, index);
+ if (hidx & _PTEIDX_SECONDARY)
+ hash = ~hash;
+ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot += hidx & _PTEIDX_GROUP_IX;
+
+ ret = ppc_md.hpte_updatepp(slot, rflags, vpn,
+ psize, lpsize, ssize, local);
+ /*
+ * We failed to update, try to insert a new entry.
+ */
+ if (ret == -1) {
+ /*
+ * large pte is marked busy, so we can be sure
+ * nobody is looking at hpte_slot_array. hence we can
+ * safely update this here.
+ */
+ valid = 0;
+ new_pmd &= ~_PAGE_HPTEFLAGS;
+ hpte_slot_array[index] = 0;
+ } else
+ /* clear the busy bits and set the hash pte bits */
+ new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
+ }
+
+ if (!valid) {
+ unsigned long hpte_group;
+
+ /* insert new entry */
+ pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT;
+repeat:
+ hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
+
+ /* clear the busy bits and set the hash pte bits */
+ new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
+
+ /* Add in WIMG bits */
+ rflags |= (new_pmd & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
+ _PAGE_GUARDED));
+ /*
+ * enable the memory coherence always
+ */
+ rflags |= HPTE_R_M;
+
+ /* Insert into the hash table, primary slot */
+ slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0,
+ psize, lpsize, ssize);
+ /*
+ * Primary is full, try the secondary
+ */
+ if (unlikely(slot == -1)) {
+ hpte_group = ((~hash & htab_hash_mask) *
+ HPTES_PER_GROUP) & ~0x7UL;
+ slot = ppc_md.hpte_insert(hpte_group, vpn, pa,
+ rflags, HPTE_V_SECONDARY,
+ psize, lpsize, ssize);
+ if (slot == -1) {
+ if (mftb() & 0x1)
+ hpte_group = ((hash & htab_hash_mask) *
+ HPTES_PER_GROUP) & ~0x7UL;
+
+ ppc_md.hpte_remove(hpte_group);
+ goto repeat;
+ }
+ }
+ /*
+ * Hypervisor failure. Restore old pmd and return -1
+ * similar to __hash_page_*
+ */
+ if (unlikely(slot == -2)) {
+ *pmdp = __pmd(old_pmd);
+ hash_failure_debug(ea, access, vsid, trap, ssize,
+ psize, lpsize, old_pmd);
+ return -1;
+ }
+ /*
+ * large pte is marked busy, so we can be sure
+ * nobody is looking at hpte_slot_array. hence we can
+ * safely update this here.
+ */
+ mark_hpte_slot_valid(hpte_slot_array, index, slot);
+ }
+ /*
+ * No need to use ldarx/stdcx here
+ */
+ *pmdp = __pmd(new_pmd & ~_PAGE_BUSY);
+ return 0;
+}
diff --git a/arch/powerpc/mm/hugetlbpage-book3e.c b/arch/powerpc/mm/hugetlbpage-book3e.c
index 3bc700655fc..5e4ee257390 100644
--- a/arch/powerpc/mm/hugetlbpage-book3e.c
+++ b/arch/powerpc/mm/hugetlbpage-book3e.c
@@ -8,6 +8,44 @@
#include <linux/mm.h>
#include <linux/hugetlb.h>
+#ifdef CONFIG_PPC_FSL_BOOK3E
+#ifdef CONFIG_PPC64
+static inline int tlb1_next(void)
+{
+ struct paca_struct *paca = get_paca();
+ struct tlb_core_data *tcd;
+ int this, next;
+
+ tcd = paca->tcd_ptr;
+ this = tcd->esel_next;
+
+ next = this + 1;
+ if (next >= tcd->esel_max)
+ next = tcd->esel_first;
+
+ tcd->esel_next = next;
+ return this;
+}
+#else
+static inline int tlb1_next(void)
+{
+ int index, ncams;
+
+ ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY;
+
+ index = __get_cpu_var(next_tlbcam_idx);
+
+ /* Just round-robin the entries and wrap when we hit the end */
+ if (unlikely(index == ncams - 1))
+ __get_cpu_var(next_tlbcam_idx) = tlbcam_index;
+ else
+ __get_cpu_var(next_tlbcam_idx)++;
+
+ return index;
+}
+#endif /* !PPC64 */
+#endif /* FSL */
+
static inline int mmu_get_tsize(int psize)
{
return mmu_psize_defs[psize].enc;
@@ -47,7 +85,7 @@ void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea,
struct mm_struct *mm;
#ifdef CONFIG_PPC_FSL_BOOK3E
- int index, ncams;
+ int index;
#endif
if (unlikely(is_kernel_addr(ea)))
@@ -77,18 +115,11 @@ void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea,
}
#ifdef CONFIG_PPC_FSL_BOOK3E
- ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY;
-
/* We have to use the CAM(TLB1) on FSL parts for hugepages */
- index = __get_cpu_var(next_tlbcam_idx);
+ index = tlb1_next();
mtspr(SPRN_MAS0, MAS0_ESEL(index) | MAS0_TLBSEL(1));
-
- /* Just round-robin the entries and wrap when we hit the end */
- if (unlikely(index == ncams - 1))
- __get_cpu_var(next_tlbcam_idx) = tlbcam_index;
- else
- __get_cpu_var(next_tlbcam_idx)++;
#endif
+
mas1 = MAS1_VALID | MAS1_TID(mm->context.id) | MAS1_TSIZE(tsize);
mas2 = ea & ~((1UL << shift) - 1);
mas2 |= (pte_val(pte) >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK;
@@ -103,7 +134,8 @@ void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea,
if (mmu_has_feature(MMU_FTR_USE_PAIRED_MAS)) {
mtspr(SPRN_MAS7_MAS3, mas7_3);
} else {
- mtspr(SPRN_MAS7, upper_32_bits(mas7_3));
+ if (mmu_has_feature(MMU_FTR_BIG_PHYS))
+ mtspr(SPRN_MAS7, upper_32_bits(mas7_3));
mtspr(SPRN_MAS3, lower_32_bits(mas7_3));
}
@@ -117,6 +149,5 @@ void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
struct hstate *hstate = hstate_file(vma->vm_file);
unsigned long tsize = huge_page_shift(hstate) - 10;
- __flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, tsize, 0);
-
+ __flush_tlb_page(vma->vm_mm, vmaddr, tsize, 0);
}
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
index cc5c273086c..a5bcf930119 100644
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -14,18 +14,23 @@
#include <asm/cacheflush.h>
#include <asm/machdep.h>
+extern long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
+ unsigned long pa, unsigned long rlags,
+ unsigned long vflags, int psize, int ssize);
+
int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
pte_t *ptep, unsigned long trap, int local, int ssize,
unsigned int shift, unsigned int mmu_psize)
{
+ unsigned long vpn;
unsigned long old_pte, new_pte;
- unsigned long va, rflags, pa, sz;
+ unsigned long rflags, pa, sz;
long slot;
BUG_ON(shift != mmu_psize_defs[mmu_psize].shift);
/* Search the Linux page table for a match with va */
- va = hpt_va(ea, vsid, ssize);
+ vpn = hpt_vpn(ea, vsid, ssize);
/* At this point, we have a pte (old_pte) which can be used to build
* or update an HPTE. There are 2 cases:
@@ -69,27 +74,22 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
/* There MIGHT be an HPTE for this pte */
unsigned long hash, slot;
- hash = hpt_hash(va, shift, ssize);
+ hash = hpt_hash(vpn, shift, ssize);
if (old_pte & _PAGE_F_SECOND)
hash = ~hash;
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
slot += (old_pte & _PAGE_F_GIX) >> 12;
- if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize,
- ssize, local) == -1)
+ if (ppc_md.hpte_updatepp(slot, rflags, vpn, mmu_psize,
+ mmu_psize, ssize, local) == -1)
old_pte &= ~_PAGE_HPTEFLAGS;
}
if (likely(!(old_pte & _PAGE_HASHPTE))) {
- unsigned long hash = hpt_hash(va, shift, ssize);
- unsigned long hpte_group;
+ unsigned long hash = hpt_hash(vpn, shift, ssize);
pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
-repeat:
- hpte_group = ((hash & htab_hash_mask) *
- HPTES_PER_GROUP) & ~0x7UL;
-
/* clear HPTE slot informations in new PTE */
#ifdef CONFIG_PPC_64K_PAGES
new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;
@@ -99,27 +99,13 @@ repeat:
/* Add in WIMG bits */
rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
_PAGE_COHERENT | _PAGE_GUARDED));
+ /*
+ * enable the memory coherence always
+ */
+ rflags |= HPTE_R_M;
- /* Insert into the hash table, primary slot */
- slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
- mmu_psize, ssize);
-
- /* Primary is full, try the secondary */
- if (unlikely(slot == -1)) {
- hpte_group = ((~hash & htab_hash_mask) *
- HPTES_PER_GROUP) & ~0x7UL;
- slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
- HPTE_V_SECONDARY,
- mmu_psize, ssize);
- if (slot == -1) {
- if (mftb() & 0x1)
- hpte_group = ((hash & htab_hash_mask) *
- HPTES_PER_GROUP)&~0x7UL;
-
- ppc_md.hpte_remove(hpte_group);
- goto repeat;
- }
- }
+ slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0,
+ mmu_psize, ssize);
/*
* Hypervisor failure. Restore old pte and return -1
@@ -128,7 +114,7 @@ repeat:
if (unlikely(slot == -2)) {
*ptep = __pte(old_pte);
hash_failure_debug(ea, access, vsid, trap, ssize,
- mmu_psize, old_pte);
+ mmu_psize, mmu_psize, old_pte);
return -1;
}
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index a8b3cc7d90f..7e70ae968e5 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -12,6 +12,7 @@
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
+#include <linux/export.h>
#include <linux/of_fdt.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
@@ -20,6 +21,9 @@
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/setup.h>
+#include <asm/hugetlb.h>
+
+#ifdef CONFIG_HUGETLB_PAGE
#define PAGE_SHIFT_64K 16
#define PAGE_SHIFT_16M 24
@@ -47,65 +51,61 @@ static u64 gpage_freearray[MAX_NUMBER_GPAGES];
static unsigned nr_gpages;
#endif
-static inline int shift_to_mmu_psize(unsigned int shift)
-{
- int psize;
+#define hugepd_none(hpd) ((hpd).pd == 0)
- for (psize = 0; psize < MMU_PAGE_COUNT; ++psize)
- if (mmu_psize_defs[psize].shift == shift)
- return psize;
- return -1;
-}
+#ifdef CONFIG_PPC_BOOK3S_64
+/*
+ * At this point we do the placement change only for BOOK3S 64. This would
+ * possibly work on other subarchs.
+ */
-static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
+/*
+ * We have PGD_INDEX_SIZ = 12 and PTE_INDEX_SIZE = 8, so that we can have
+ * 16GB hugepage pte in PGD and 16MB hugepage pte at PMD;
+ */
+int pmd_huge(pmd_t pmd)
{
- if (mmu_psize_defs[mmu_psize].shift)
- return mmu_psize_defs[mmu_psize].shift;
- BUG();
+ /*
+ * leaf pte for huge page, bottom two bits != 00
+ */
+ return ((pmd_val(pmd) & 0x3) != 0x0);
}
-#define hugepd_none(hpd) ((hpd).pd == 0)
-
-pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
+int pud_huge(pud_t pud)
{
- pgd_t *pg;
- pud_t *pu;
- pmd_t *pm;
- hugepd_t *hpdp = NULL;
- unsigned pdshift = PGDIR_SHIFT;
-
- if (shift)
- *shift = 0;
+ /*
+ * leaf pte for huge page, bottom two bits != 00
+ */
+ return ((pud_val(pud) & 0x3) != 0x0);
+}
- pg = pgdir + pgd_index(ea);
- if (is_hugepd(pg)) {
- hpdp = (hugepd_t *)pg;
- } else if (!pgd_none(*pg)) {
- pdshift = PUD_SHIFT;
- pu = pud_offset(pg, ea);
- if (is_hugepd(pu))
- hpdp = (hugepd_t *)pu;
- else if (!pud_none(*pu)) {
- pdshift = PMD_SHIFT;
- pm = pmd_offset(pu, ea);
- if (is_hugepd(pm))
- hpdp = (hugepd_t *)pm;
- else if (!pmd_none(*pm)) {
- return pte_offset_kernel(pm, ea);
- }
- }
- }
+int pgd_huge(pgd_t pgd)
+{
+ /*
+ * leaf pte for huge page, bottom two bits != 00
+ */
+ return ((pgd_val(pgd) & 0x3) != 0x0);
+}
+#else
+int pmd_huge(pmd_t pmd)
+{
+ return 0;
+}
- if (!hpdp)
- return NULL;
+int pud_huge(pud_t pud)
+{
+ return 0;
+}
- if (shift)
- *shift = hugepd_shift(*hpdp);
- return hugepte_offset(hpdp, ea, pdshift);
+int pgd_huge(pgd_t pgd)
+{
+ return 0;
}
+#endif
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
+ /* Only called for hugetlbfs pages, hence can ignore THP */
return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
}
@@ -143,6 +143,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
if (unlikely(!hugepd_none(*hpdp)))
break;
else
+ /* We use the old format for PPC_FSL_BOOK3E */
hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
}
/* If we bailed from the for loop early, an error occurred, clean up */
@@ -154,9 +155,15 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
#else
if (!hugepd_none(*hpdp))
kmem_cache_free(cachep, new);
- else
+ else {
+#ifdef CONFIG_PPC_BOOK3S_64
+ hpdp->pd = (unsigned long)new |
+ (shift_to_mmu_psize(pshift) << 2);
+#else
hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
#endif
+ }
+#endif
spin_unlock(&mm->page_table_lock);
return 0;
}
@@ -173,6 +180,61 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
#define HUGEPD_PUD_SHIFT PMD_SHIFT
#endif
+#ifdef CONFIG_PPC_BOOK3S_64
+/*
+ * At this point we do the placement change only for BOOK3S 64. This would
+ * possibly work on other subarchs.
+ */
+pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
+{
+ pgd_t *pg;
+ pud_t *pu;
+ pmd_t *pm;
+ hugepd_t *hpdp = NULL;
+ unsigned pshift = __ffs(sz);
+ unsigned pdshift = PGDIR_SHIFT;
+
+ addr &= ~(sz-1);
+ pg = pgd_offset(mm, addr);
+
+ if (pshift == PGDIR_SHIFT)
+ /* 16GB huge page */
+ return (pte_t *) pg;
+ else if (pshift > PUD_SHIFT)
+ /*
+ * We need to use hugepd table
+ */
+ hpdp = (hugepd_t *)pg;
+ else {
+ pdshift = PUD_SHIFT;
+ pu = pud_alloc(mm, pg, addr);
+ if (pshift == PUD_SHIFT)
+ return (pte_t *)pu;
+ else if (pshift > PMD_SHIFT)
+ hpdp = (hugepd_t *)pu;
+ else {
+ pdshift = PMD_SHIFT;
+ pm = pmd_alloc(mm, pu, addr);
+ if (pshift == PMD_SHIFT)
+ /* 16MB hugepage */
+ return (pte_t *)pm;
+ else
+ hpdp = (hugepd_t *)pm;
+ }
+ }
+ if (!hpdp)
+ return NULL;
+
+ BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
+
+ if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
+ return NULL;
+
+ return hugepte_offset(hpdp, addr, pdshift);
+}
+
+#else
+
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
pgd_t *pg;
@@ -210,6 +272,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
return hugepte_offset(hpdp, addr, pdshift);
}
+#endif
#ifdef CONFIG_PPC_FSL_BOOK3E
/* Build list of addresses of gigantic pages. This function is used in early
@@ -238,7 +301,7 @@ void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
int alloc_bootmem_huge_page(struct hstate *hstate)
{
struct huge_bootmem_page *m;
- int idx = shift_to_mmu_psize(hstate->order + PAGE_SHIFT);
+ int idx = shift_to_mmu_psize(huge_page_shift(hstate));
int nr_gpages = gpage_freearray[idx].nr_gpages;
if (nr_gpages == 0)
@@ -269,7 +332,8 @@ int alloc_bootmem_huge_page(struct hstate *hstate)
unsigned long gpage_npages[MMU_PAGE_COUNT];
-static int __init do_gpage_early_setup(char *param, char *val)
+static int __init do_gpage_early_setup(char *param, char *val,
+ const char *unused)
{
static phys_addr_t size;
unsigned long npages;
@@ -310,7 +374,8 @@ void __init reserve_hugetlb_gpages(void)
int i;
strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE);
- parse_args("hugetlb gpages", cmdline, NULL, 0, &do_gpage_early_setup);
+ parse_args("hugetlb gpages", cmdline, NULL, 0, 0, 0,
+ &do_gpage_early_setup);
/*
* Walk gpage list in reverse, allocating larger page sizes first.
@@ -397,12 +462,13 @@ static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
{
struct hugepd_freelist **batchp;
- batchp = &__get_cpu_var(hugepd_freelist_cur);
+ batchp = &get_cpu_var(hugepd_freelist_cur);
if (atomic_read(&tlb->mm->mm_users) < 2 ||
cpumask_equal(mm_cpumask(tlb->mm),
cpumask_of(smp_processor_id()))) {
kmem_cache_free(hugepte_cache, hugepte);
+ put_cpu_var(hugepd_freelist_cur);
return;
}
@@ -416,6 +482,7 @@ static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
*batchp = NULL;
}
+ put_cpu_var(hugepd_freelist_cur);
}
#endif
@@ -471,8 +538,14 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
do {
pmd = pmd_offset(pud, addr);
next = pmd_addr_end(addr, end);
- if (pmd_none(*pmd))
+ if (!is_hugepd(pmd)) {
+ /*
+ * if it is not hugepd pointer, we should already find
+ * it cleared.
+ */
+ WARN_ON(!pmd_none_or_clear_bad(pmd));
continue;
+ }
#ifdef CONFIG_PPC_FSL_BOOK3E
/*
* Increment next by the size of the huge mapping since
@@ -552,8 +625,6 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
/*
* This function frees user-level page tables of a process.
- *
- * Must be called with pagetable lock held.
*/
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
@@ -609,11 +680,14 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
struct page *page;
unsigned shift;
unsigned long mask;
-
+ /*
+ * Transparent hugepages are handled by generic code. We can skip them
+ * here.
+ */
ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
/* Verify it is a huge page else bail. */
- if (!ptep || !shift)
+ if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep))
return ERR_PTR(-EINVAL);
mask = (1UL << shift) - 1;
@@ -624,16 +698,6 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
return page;
}
-int pmd_huge(pmd_t pmd)
-{
- return 0;
-}
-
-int pud_huge(pud_t pud)
-{
- return 0;
-}
-
struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
pmd_t *pmd, int write)
@@ -642,69 +706,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
return NULL;
}
-static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
-{
- unsigned long mask;
- unsigned long pte_end;
- struct page *head, *page, *tail;
- pte_t pte;
- int refs;
-
- pte_end = (addr + sz) & ~(sz-1);
- if (pte_end < end)
- end = pte_end;
-
- pte = *ptep;
- mask = _PAGE_PRESENT | _PAGE_USER;
- if (write)
- mask |= _PAGE_RW;
-
- if ((pte_val(pte) & mask) != mask)
- return 0;
-
- /* hugepages are never "special" */
- VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
-
- refs = 0;
- head = pte_page(pte);
-
- page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
- tail = page;
- do {
- VM_BUG_ON(compound_head(page) != head);
- pages[*nr] = page;
- (*nr)++;
- page++;
- refs++;
- } while (addr += PAGE_SIZE, addr != end);
-
- if (!page_cache_add_speculative(head, refs)) {
- *nr -= refs;
- return 0;
- }
-
- if (unlikely(pte_val(pte) != pte_val(*ptep))) {
- /* Could be optimized better */
- *nr -= refs;
- while (refs--)
- put_page(head);
- return 0;
- }
-
- /*
- * Any tail page need their mapcount reference taken before we
- * return.
- */
- while (refs--) {
- if (PageTail(tail))
- get_huge_page_tail(tail);
- tail++;
- }
-
- return 1;
-}
-
static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
unsigned long sz)
{
@@ -738,7 +739,7 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
struct hstate *hstate = hstate_file(file);
int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
- return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
+ return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1);
}
#endif
@@ -879,11 +880,16 @@ static int __init hugetlbpage_init(void)
pdshift = PUD_SHIFT;
else
pdshift = PGDIR_SHIFT;
-
- pgtable_cache_add(pdshift - shift, NULL);
- if (!PGT_CACHE(pdshift - shift))
- panic("hugetlbpage_init(): could not create "
- "pgtable cache for %d bit pagesize\n", shift);
+ /*
+ * if we have pdshift and shift value same, we don't
+ * use pgt cache for hugepd.
+ */
+ if (pdshift != shift) {
+ pgtable_cache_add(pdshift - shift, NULL);
+ if (!PGT_CACHE(pdshift - shift))
+ panic("hugetlbpage_init(): could not create "
+ "pgtable cache for %d bit pagesize\n", shift);
+ }
}
/* Set default large page size. Currently, we pick 16M or 1M
@@ -910,9 +916,174 @@ void flush_dcache_icache_hugepage(struct page *page)
if (!PageHighMem(page)) {
__flush_dcache_icache(page_address(page+i));
} else {
- start = kmap_atomic(page+i, KM_PPC_SYNC_ICACHE);
+ start = kmap_atomic(page+i);
__flush_dcache_icache(start);
- kunmap_atomic(start, KM_PPC_SYNC_ICACHE);
+ kunmap_atomic(start);
+ }
+ }
+}
+
+#endif /* CONFIG_HUGETLB_PAGE */
+
+/*
+ * We have 4 cases for pgds and pmds:
+ * (1) invalid (all zeroes)
+ * (2) pointer to next table, as normal; bottom 6 bits == 0
+ * (3) leaf pte for huge page, bottom two bits != 00
+ * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table
+ *
+ * So long as we atomically load page table pointers we are safe against teardown,
+ * we can follow the address down to the the page and take a ref on it.
+ */
+
+pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
+{
+ pgd_t pgd, *pgdp;
+ pud_t pud, *pudp;
+ pmd_t pmd, *pmdp;
+ pte_t *ret_pte;
+ hugepd_t *hpdp = NULL;
+ unsigned pdshift = PGDIR_SHIFT;
+
+ if (shift)
+ *shift = 0;
+
+ pgdp = pgdir + pgd_index(ea);
+ pgd = ACCESS_ONCE(*pgdp);
+ /*
+ * Always operate on the local stack value. This make sure the
+ * value don't get updated by a parallel THP split/collapse,
+ * page fault or a page unmap. The return pte_t * is still not
+ * stable. So should be checked there for above conditions.
+ */
+ if (pgd_none(pgd))
+ return NULL;
+ else if (pgd_huge(pgd)) {
+ ret_pte = (pte_t *) pgdp;
+ goto out;
+ } else if (is_hugepd(&pgd))
+ hpdp = (hugepd_t *)&pgd;
+ else {
+ /*
+ * Even if we end up with an unmap, the pgtable will not
+ * be freed, because we do an rcu free and here we are
+ * irq disabled
+ */
+ pdshift = PUD_SHIFT;
+ pudp = pud_offset(&pgd, ea);
+ pud = ACCESS_ONCE(*pudp);
+
+ if (pud_none(pud))
+ return NULL;
+ else if (pud_huge(pud)) {
+ ret_pte = (pte_t *) pudp;
+ goto out;
+ } else if (is_hugepd(&pud))
+ hpdp = (hugepd_t *)&pud;
+ else {
+ pdshift = PMD_SHIFT;
+ pmdp = pmd_offset(&pud, ea);
+ pmd = ACCESS_ONCE(*pmdp);
+ /*
+ * A hugepage collapse is captured by pmd_none, because
+ * it mark the pmd none and do a hpte invalidate.
+ *
+ * A hugepage split is captured by pmd_trans_splitting
+ * because we mark the pmd trans splitting and do a
+ * hpte invalidate
+ *
+ */
+ if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+ return NULL;
+
+ if (pmd_huge(pmd) || pmd_large(pmd)) {
+ ret_pte = (pte_t *) pmdp;
+ goto out;
+ } else if (is_hugepd(&pmd))
+ hpdp = (hugepd_t *)&pmd;
+ else
+ return pte_offset_kernel(&pmd, ea);
}
}
+ if (!hpdp)
+ return NULL;
+
+ ret_pte = hugepte_offset(hpdp, ea, pdshift);
+ pdshift = hugepd_shift(*hpdp);
+out:
+ if (shift)
+ *shift = pdshift;
+ return ret_pte;
+}
+EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte);
+
+int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
+ unsigned long end, int write, struct page **pages, int *nr)
+{
+ unsigned long mask;
+ unsigned long pte_end;
+ struct page *head, *page, *tail;
+ pte_t pte;
+ int refs;
+
+ pte_end = (addr + sz) & ~(sz-1);
+ if (pte_end < end)
+ end = pte_end;
+
+ pte = ACCESS_ONCE(*ptep);
+ mask = _PAGE_PRESENT | _PAGE_USER;
+ if (write)
+ mask |= _PAGE_RW;
+
+ if ((pte_val(pte) & mask) != mask)
+ return 0;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ /*
+ * check for splitting here
+ */
+ if (pmd_trans_splitting(pte_pmd(pte)))
+ return 0;
+#endif
+
+ /* hugepages are never "special" */
+ VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+ refs = 0;
+ head = pte_page(pte);
+
+ page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
+ tail = page;
+ do {
+ VM_BUG_ON(compound_head(page) != head);
+ pages[*nr] = page;
+ (*nr)++;
+ page++;
+ refs++;
+ } while (addr += PAGE_SIZE, addr != end);
+
+ if (!page_cache_add_speculative(head, refs)) {
+ *nr -= refs;
+ return 0;
+ }
+
+ if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+ /* Could be optimized better */
+ *nr -= refs;
+ while (refs--)
+ put_page(head);
+ return 0;
+ }
+
+ /*
+ * Any tail page need their mapcount reference taken before we
+ * return.
+ */
+ while (refs--) {
+ if (PageTail(tail))
+ get_huge_page_tail(tail);
+ tail++;
+ }
+
+ return 1;
}
diff --git a/arch/powerpc/mm/icswx.c b/arch/powerpc/mm/icswx.c
index 5d9a59eaad9..915412e4d5b 100644
--- a/arch/powerpc/mm/icswx.c
+++ b/arch/powerpc/mm/icswx.c
@@ -67,7 +67,7 @@
void switch_cop(struct mm_struct *next)
{
-#ifdef CONFIG_ICSWX_PID
+#ifdef CONFIG_PPC_ICSWX_PID
mtspr(SPRN_PID, next->context.cop_pid);
#endif
mtspr(SPRN_ACOP, next->context.acop);
@@ -163,7 +163,7 @@ EXPORT_SYMBOL_GPL(drop_cop);
static int acop_use_cop(int ct)
{
- /* todo */
+ /* There is no alternate policy, yet */
return -1;
}
@@ -227,11 +227,30 @@ int acop_handle_fault(struct pt_regs *regs, unsigned long address,
ct = (ccw >> 16) & 0x3f;
}
+ /*
+ * We could be here because another thread has enabled acop
+ * but the ACOP register has yet to be updated.
+ *
+ * This should have been taken care of by the IPI to sync all
+ * the threads (see smp_call_function(sync_cop, mm, 1)), but
+ * that could take forever if there are a significant amount
+ * of threads.
+ *
+ * Given the number of threads on some of these systems,
+ * perhaps this is the best way to sync ACOP rather than whack
+ * every thread with an IPI.
+ */
+ if ((acop_copro_type_bit(ct) & current->active_mm->context.acop) != 0) {
+ sync_cop(current->active_mm);
+ return 0;
+ }
+
+ /* check for alternate policy */
if (!acop_use_cop(ct))
return 0;
/* at this point the CT is unknown to the system */
- pr_warn("%s[%d]: Coprocessor %d is unavailable",
+ pr_warn("%s[%d]: Coprocessor %d is unavailable\n",
current->comm, current->pid, ct);
/* get inst if we don't already have it */
diff --git a/arch/powerpc/mm/icswx.h b/arch/powerpc/mm/icswx.h
index 42176bd0884..6dedc08e62c 100644
--- a/arch/powerpc/mm/icswx.h
+++ b/arch/powerpc/mm/icswx.h
@@ -59,4 +59,10 @@ extern void free_cop_pid(int free_pid);
extern int acop_handle_fault(struct pt_regs *regs, unsigned long address,
unsigned long error_code);
+
+static inline u64 acop_copro_type_bit(unsigned int type)
+{
+ return 1ULL << (63 - type);
+}
+
#endif /* !_ARCH_POWERPC_MM_ICSWX_H_ */
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index 6157be2a704..cff59f1bec2 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -45,7 +45,6 @@
#include <asm/btext.h>
#include <asm/tlb.h>
#include <asm/sections.h>
-#include <asm/system.h>
#include <asm/hugetlb.h>
#include "mmu_decl.h"
@@ -53,7 +52,7 @@
#if defined(CONFIG_KERNEL_START_BOOL) || defined(CONFIG_LOWMEM_SIZE_BOOL)
/* The amount of lowmem must be within 0xF0000000 - KERNELBASE. */
#if (CONFIG_LOWMEM_SIZE > (0xF0000000 - PAGE_OFFSET))
-#error "You must adjust CONFIG_LOWMEM_SIZE or CONFIG_START_KERNEL"
+#error "You must adjust CONFIG_LOWMEM_SIZE or CONFIG_KERNEL_START"
#endif
#endif
#define MAX_LOW_MEM CONFIG_LOWMEM_SIZE
@@ -214,7 +213,12 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base,
*/
BUG_ON(first_memblock_base != 0);
+#ifdef CONFIG_PIN_TLB
+ /* 8xx can only access 24MB at the moment */
+ memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01800000));
+#else
/* 8xx can only access 8MB at the moment */
memblock_set_current_limit(min_t(u64, first_memblock_size, 0x00800000));
+#endif
}
#endif /* CONFIG_8xx */
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index e94b57fb79a..e3734edffa6 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -61,9 +61,7 @@
#include <asm/mmzone.h>
#include <asm/cputable.h>
#include <asm/sections.h>
-#include <asm/system.h>
#include <asm/iommu.h>
-#include <asm/abs_addr.h>
#include <asm/vdso.h>
#include "mmu_decl.h"
@@ -90,7 +88,11 @@ static void pgd_ctor(void *addr)
static void pmd_ctor(void *addr)
{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ memset(addr, 0, PMD_TABLE_SIZE * 2);
+#else
memset(addr, 0, PMD_TABLE_SIZE);
+#endif
}
struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
@@ -131,8 +133,7 @@ void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
align = max_t(unsigned long, align, minalign);
name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift);
new = kmem_cache_create(name, table_size, align, 0, ctor);
- PGT_CACHE(shift) = new;
-
+ pgtable_cache[shift - 1] = new;
pr_debug("Allocated pgtable cache for order %d\n", shift);
}
@@ -140,10 +141,9 @@ void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
void pgtable_cache_init(void)
{
pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
- pgtable_cache_add(PMD_INDEX_SIZE, pmd_ctor);
- if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_INDEX_SIZE))
+ pgtable_cache_add(PMD_CACHE_INDEX, pmd_ctor);
+ if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_CACHE_INDEX))
panic("Couldn't allocate pgtable caches");
-
/* In all current configs, when the PUD index exists it's the
* same size as either the pgd or pmd index. Verify that the
* initialization above has also created a PUD cache. This
@@ -218,7 +218,8 @@ static void __meminit vmemmap_create_mapping(unsigned long start,
unsigned long phys)
{
int mapped = htab_bolt_mapping(start, start + page_size, phys,
- PAGE_KERNEL, mmu_vmemmap_psize,
+ pgprot_val(PAGE_KERNEL),
+ mmu_vmemmap_psize,
mmu_kernel_ssize);
BUG_ON(mapped < 0);
}
@@ -265,19 +266,14 @@ static __meminit void vmemmap_list_populate(unsigned long phys,
vmemmap_list = vmem_back;
}
-int __meminit vmemmap_populate(struct page *start_page,
- unsigned long nr_pages, int node)
+int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
{
- unsigned long start = (unsigned long)start_page;
- unsigned long end = (unsigned long)(start_page + nr_pages);
unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
/* Align to the page size of the linear mapping. */
start = _ALIGN_DOWN(start, page_size);
- pr_debug("vmemmap_populate page %p, %ld pages, node %d\n",
- start_page, nr_pages, node);
- pr_debug(" -> map %lx..%lx\n", start, end);
+ pr_debug("vmemmap_populate %lx..%lx, node %d\n", start, end, node);
for (; start < end; start += page_size) {
void *p;
@@ -299,5 +295,63 @@ int __meminit vmemmap_populate(struct page *start_page,
return 0;
}
-#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+void vmemmap_free(unsigned long start, unsigned long end)
+{
+}
+
+void register_page_bootmem_memmap(unsigned long section_nr,
+ struct page *start_page, unsigned long size)
+{
+}
+
+/*
+ * We do not have access to the sparsemem vmemmap, so we fallback to
+ * walking the list of sparsemem blocks which we already maintain for
+ * the sake of crashdump. In the long run, we might want to maintain
+ * a tree if performance of that linear walk becomes a problem.
+ *
+ * realmode_pfn_to_page functions can fail due to:
+ * 1) As real sparsemem blocks do not lay in RAM continously (they
+ * are in virtual address space which is not available in the real mode),
+ * the requested page struct can be split between blocks so get_page/put_page
+ * may fail.
+ * 2) When huge pages are used, the get_page/put_page API will fail
+ * in real mode as the linked addresses in the page struct are virtual
+ * too.
+ */
+struct page *realmode_pfn_to_page(unsigned long pfn)
+{
+ struct vmemmap_backing *vmem_back;
+ struct page *page;
+ unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
+ unsigned long pg_va = (unsigned long) pfn_to_page(pfn);
+
+ for (vmem_back = vmemmap_list; vmem_back; vmem_back = vmem_back->list) {
+ if (pg_va < vmem_back->virt_addr)
+ continue;
+
+ /* Check that page struct is not split between real pages */
+ if ((pg_va + sizeof(struct page)) >
+ (vmem_back->virt_addr + page_size))
+ return NULL;
+
+ page = (struct page *) (vmem_back->phys + pg_va -
+ vmem_back->virt_addr);
+ return page;
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
+
+#elif defined(CONFIG_FLATMEM)
+
+struct page *realmode_pfn_to_page(unsigned long pfn)
+{
+ struct page *page = pfn_to_page(pfn);
+ return page;
+}
+EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
+
+#endif /* CONFIG_SPARSEMEM_VMEMMAP/CONFIG_FLATMEM */
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index d974b79a306..2c8e90f5789 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -62,14 +62,13 @@
int init_bootmem_done;
int mem_init_done;
-phys_addr_t memory_limit;
+unsigned long long memory_limit;
#ifdef CONFIG_HIGHMEM
pte_t *kmap_pte;
+EXPORT_SYMBOL(kmap_pte);
pgprot_t kmap_prot;
-
EXPORT_SYMBOL(kmap_prot);
-EXPORT_SYMBOL(kmap_pte);
static inline pte_t *virt_to_kpte(unsigned long vaddr)
{
@@ -133,6 +132,23 @@ int arch_add_memory(int nid, u64 start, u64 size)
return __add_pages(nid, zone, start_pfn, nr_pages);
}
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+ struct zone *zone;
+ int ret;
+
+ zone = page_zone(pfn_to_page(start_pfn));
+ ret = __remove_pages(zone, start_pfn, nr_pages);
+ if (!ret && (ppc_md.remove_memory))
+ ret = ppc_md.remove_memory(start, size);
+
+ return ret;
+}
+#endif
#endif /* CONFIG_MEMORY_HOTPLUG */
/*
@@ -195,13 +211,10 @@ void __init do_init_bootmem(void)
min_low_pfn = MEMORY_START >> PAGE_SHIFT;
boot_mapsize = init_bootmem_node(NODE_DATA(0), start >> PAGE_SHIFT, min_low_pfn, max_low_pfn);
- /* Add active regions with valid PFNs */
- for_each_memblock(memory, reg) {
- unsigned long start_pfn, end_pfn;
- start_pfn = memblock_region_memory_base_pfn(reg);
- end_pfn = memblock_region_memory_end_pfn(reg);
- memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
- }
+ /* Place all memblock_regions in the same node and merge contiguous
+ * memblock_regions
+ */
+ memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
/* Add all physical memory to the bootmem map, mark each area
* present.
@@ -289,50 +302,30 @@ void __init paging_init(void)
}
#endif /* ! CONFIG_NEED_MULTIPLE_NODES */
+static void __init register_page_bootmem_info(void)
+{
+ int i;
+
+ for_each_online_node(i)
+ register_page_bootmem_info_node(NODE_DATA(i));
+}
+
void __init mem_init(void)
{
-#ifdef CONFIG_NEED_MULTIPLE_NODES
- int nid;
-#endif
- pg_data_t *pgdat;
- unsigned long i;
- struct page *page;
- unsigned long reservedpages = 0, codesize, initsize, datasize, bsssize;
+ /*
+ * book3s is limited to 16 page sizes due to encoding this in
+ * a 4-bit field for slices.
+ */
+ BUILD_BUG_ON(MMU_PAGE_COUNT > 16);
#ifdef CONFIG_SWIOTLB
- if (ppc_swiotlb_enable)
- swiotlb_init(1);
+ swiotlb_init(0);
#endif
- num_physpages = memblock_phys_mem_size() >> PAGE_SHIFT;
+ register_page_bootmem_info();
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
-
-#ifdef CONFIG_NEED_MULTIPLE_NODES
- for_each_online_node(nid) {
- if (NODE_DATA(nid)->node_spanned_pages != 0) {
- printk("freeing bootmem node %d\n", nid);
- totalram_pages +=
- free_all_bootmem_node(NODE_DATA(nid));
- }
- }
-#else
- max_mapnr = max_pfn;
- totalram_pages += free_all_bootmem();
-#endif
- for_each_online_pgdat(pgdat) {
- for (i = 0; i < pgdat->node_spanned_pages; i++) {
- if (!pfn_valid(pgdat->node_start_pfn + i))
- continue;
- page = pgdat_page_nr(pgdat, i);
- if (PageReserved(page))
- reservedpages++;
- }
- }
-
- codesize = (unsigned long)&_sdata - (unsigned long)&_stext;
- datasize = (unsigned long)&_edata - (unsigned long)&_sdata;
- initsize = (unsigned long)&__init_end - (unsigned long)&__init_begin;
- bsssize = (unsigned long)&__bss_stop - (unsigned long)&__bss_start;
+ set_max_mapnr(max_pfn);
+ free_all_bootmem();
#ifdef CONFIG_HIGHMEM
{
@@ -342,17 +335,9 @@ void __init mem_init(void)
for (pfn = highmem_mapnr; pfn < max_mapnr; ++pfn) {
phys_addr_t paddr = (phys_addr_t)pfn << PAGE_SHIFT;
struct page *page = pfn_to_page(pfn);
- if (memblock_is_reserved(paddr))
- continue;
- ClearPageReserved(page);
- init_page_count(page);
- __free_page(page);
- totalhigh_pages++;
- reservedpages--;
+ if (!memblock_is_reserved(paddr))
+ free_highmem_page(page);
}
- totalram_pages += totalhigh_pages;
- printk(KERN_DEBUG "High memory: %luk\n",
- totalhigh_pages << (PAGE_SHIFT-10));
}
#endif /* CONFIG_HIGHMEM */
@@ -365,16 +350,7 @@ void __init mem_init(void)
(mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) - 1;
#endif
- printk(KERN_INFO "Memory: %luk/%luk available (%luk kernel code, "
- "%luk reserved, %luk data, %luk bss, %luk init)\n",
- nr_free_pages() << (PAGE_SHIFT-10),
- num_physpages << (PAGE_SHIFT-10),
- codesize >> 10,
- reservedpages << (PAGE_SHIFT-10),
- datasize >> 10,
- bsssize >> 10,
- initsize >> 10);
-
+ mem_init_print_info(NULL);
#ifdef CONFIG_PPC32
pr_info("Kernel virtual memory layout:\n");
pr_info(" * 0x%08lx..0x%08lx : fixmap\n", FIXADDR_START, FIXADDR_TOP);
@@ -397,39 +373,14 @@ void __init mem_init(void)
void free_initmem(void)
{
- unsigned long addr;
-
ppc_md.progress = ppc_printk_progress;
-
- addr = (unsigned long)__init_begin;
- for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) {
- memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
- ClearPageReserved(virt_to_page(addr));
- init_page_count(virt_to_page(addr));
- free_page(addr);
- totalram_pages++;
- }
- pr_info("Freeing unused kernel memory: %luk freed\n",
- ((unsigned long)__init_end -
- (unsigned long)__init_begin) >> 10);
+ free_initmem_default(POISON_FREE_INITMEM);
}
#ifdef CONFIG_BLK_DEV_INITRD
void __init free_initrd_mem(unsigned long start, unsigned long end)
{
- if (start >= end)
- return;
-
- start = _ALIGN_DOWN(start, PAGE_SIZE);
- end = _ALIGN_UP(end, PAGE_SIZE);
- pr_info("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
-
- for (; start < end; start += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(start));
- init_page_count(virt_to_page(start));
- free_page(start);
- totalram_pages++;
- }
+ free_reserved_area((void *)start, (void *)end, -1, "initrd");
}
#endif
@@ -458,9 +409,9 @@ void flush_dcache_icache_page(struct page *page)
#endif
#ifdef CONFIG_BOOKE
{
- void *start = kmap_atomic(page, KM_PPC_SYNC_ICACHE);
+ void *start = kmap_atomic(page);
__flush_dcache_icache(start);
- kunmap_atomic(start, KM_PPC_SYNC_ICACHE);
+ kunmap_atomic(start);
}
#elif defined(CONFIG_8xx) || defined(CONFIG_PPC64)
/* On 8xx there is no need to kmap since highmem is not supported */
@@ -469,6 +420,7 @@ void flush_dcache_icache_page(struct page *page)
__flush_dcache_icache_phys(page_to_pfn(page) << PAGE_SHIFT);
#endif
}
+EXPORT_SYMBOL(flush_dcache_icache_page);
void clear_user_page(void *page, unsigned long vaddr, struct page *pg)
{
@@ -529,6 +481,10 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
pte_t *ptep)
{
#ifdef CONFIG_PPC_STD_MMU
+ /*
+ * We don't need to worry about _PAGE_PRESENT here because we are
+ * called with either mm->page_table_lock held or ptl lock held
+ */
unsigned long access = 0, trap;
/* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
@@ -562,7 +518,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
* System memory should not be in /proc/iomem but various tools expect it
* (eg kdump).
*/
-static int add_system_ram_resources(void)
+static int __init add_system_ram_resources(void)
{
struct memblock_region *reg;
@@ -578,7 +534,7 @@ static int add_system_ram_resources(void)
res->name = "System RAM";
res->start = base;
res->end = base + size - 1;
- res->flags = IORESOURCE_MEM;
+ res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
WARN_ON(request_resource(&iomem_resource, res) < 0);
}
}
diff --git a/arch/powerpc/mm/mmap_64.c b/arch/powerpc/mm/mmap.c
index 67a42ed0d2f..cb8bdbe4972 100644
--- a/arch/powerpc/mm/mmap_64.c
+++ b/arch/powerpc/mm/mmap.c
@@ -92,10 +92,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
if (mmap_is_legacy()) {
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = arch_get_unmapped_area;
- mm->unmap_area = arch_unmap_area;
} else {
mm->mmap_base = mmap_base();
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
- mm->unmap_area = arch_unmap_area_topdown;
}
}
diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c
index 40677aa0190..178876aef40 100644
--- a/arch/powerpc/mm/mmu_context_hash64.c
+++ b/arch/powerpc/mm/mmu_context_hash64.c
@@ -23,19 +23,13 @@
#include <linux/slab.h>
#include <asm/mmu_context.h>
+#include <asm/pgalloc.h>
#include "icswx.h"
static DEFINE_SPINLOCK(mmu_context_lock);
static DEFINE_IDA(mmu_context_ida);
-/*
- * The proto-VSID space has 2^35 - 1 segments available for user mappings.
- * Each segment contains 2^28 bytes. Each context maps 2^44 bytes,
- * so we can support 2^19-1 contexts (19 == 35 + 28 - 44).
- */
-#define MAX_CONTEXT ((1UL << 19) - 1)
-
int __init_new_context(void)
{
int index;
@@ -54,7 +48,7 @@ again:
else if (err)
return err;
- if (index > MAX_CONTEXT) {
+ if (index > MAX_USER_CONTEXT) {
spin_lock(&mmu_context_lock);
ida_remove(&mmu_context_ida, index);
spin_unlock(&mmu_context_lock);
@@ -92,6 +86,9 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
spin_lock_init(mm->context.cop_lockp);
#endif /* CONFIG_PPC_ICSWX */
+#ifdef CONFIG_PPC_64K_PAGES
+ mm->context.pte_frag = NULL;
+#endif
return 0;
}
@@ -103,13 +100,46 @@ void __destroy_context(int context_id)
}
EXPORT_SYMBOL_GPL(__destroy_context);
+#ifdef CONFIG_PPC_64K_PAGES
+static void destroy_pagetable_page(struct mm_struct *mm)
+{
+ int count;
+ void *pte_frag;
+ struct page *page;
+
+ pte_frag = mm->context.pte_frag;
+ if (!pte_frag)
+ return;
+
+ page = virt_to_page(pte_frag);
+ /* drop all the pending references */
+ count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT;
+ /* We allow PTE_FRAG_NR fragments from a PTE page */
+ count = atomic_sub_return(PTE_FRAG_NR - count, &page->_count);
+ if (!count) {
+ pgtable_page_dtor(page);
+ free_hot_cold_page(page, 0);
+ }
+}
+
+#else
+static inline void destroy_pagetable_page(struct mm_struct *mm)
+{
+ return;
+}
+#endif
+
+
void destroy_context(struct mm_struct *mm)
{
+
#ifdef CONFIG_PPC_ICSWX
drop_cop(mm->context.acop, mm);
kfree(mm->context.cop_lockp);
mm->context.cop_lockp = NULL;
#endif /* CONFIG_PPC_ICSWX */
+
+ destroy_pagetable_page(mm);
__destroy_context(mm->context.id);
subpage_prot_free(mm);
mm->context.id = MMU_NO_CONTEXT;
diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
index 5b63bd3da4a..928ebe79668 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -112,8 +112,10 @@ static unsigned int steal_context_smp(unsigned int id)
*/
for_each_cpu(cpu, mm_cpumask(mm)) {
for (i = cpu_first_thread_sibling(cpu);
- i <= cpu_last_thread_sibling(cpu); i++)
- __set_bit(id, stale_map[i]);
+ i <= cpu_last_thread_sibling(cpu); i++) {
+ if (stale_map[i])
+ __set_bit(id, stale_map[i]);
+ }
cpu = i - 1;
}
return id;
@@ -272,7 +274,8 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
/* XXX This clear should ultimately be part of local_flush_tlb_mm */
for (i = cpu_first_thread_sibling(cpu);
i <= cpu_last_thread_sibling(cpu); i++) {
- __clear_bit(id, stale_map[i]);
+ if (stale_map[i])
+ __clear_bit(id, stale_map[i]);
}
}
@@ -329,13 +332,11 @@ void destroy_context(struct mm_struct *mm)
#ifdef CONFIG_SMP
-static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
+static int mmu_context_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned int)(long)hcpu;
-#ifdef CONFIG_HOTPLUG_CPU
- struct task_struct *p;
-#endif
+
/* We don't touch CPU 0 map, it's allocated at aboot and kept
* around forever
*/
@@ -358,19 +359,14 @@ static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self,
stale_map[cpu] = NULL;
/* We also clear the cpu_vm_mask bits of CPUs going away */
- read_lock(&tasklist_lock);
- for_each_process(p) {
- if (p->mm)
- cpumask_clear_cpu(cpu, mm_cpumask(p->mm));
- }
- read_unlock(&tasklist_lock);
+ clear_tasks_mm_cpumask(cpu);
break;
#endif /* CONFIG_HOTPLUG_CPU */
}
return NOTIFY_OK;
}
-static struct notifier_block __cpuinitdata mmu_context_cpu_nb = {
+static struct notifier_block mmu_context_cpu_nb = {
.notifier_call = mmu_context_cpu_notify,
};
@@ -414,17 +410,7 @@ void __init mmu_context_init(void)
} else if (mmu_has_feature(MMU_FTR_TYPE_47x)) {
first_context = 1;
last_context = 65535;
- } else
-#ifdef CONFIG_PPC_BOOK3E_MMU
- if (mmu_has_feature(MMU_FTR_TYPE_3E)) {
- u32 mmucfg = mfspr(SPRN_MMUCFG);
- u32 pid_bits = (mmucfg & MMUCFG_PIDSIZE_MASK)
- >> MMUCFG_PIDSIZE_SHIFT;
- first_context = 1;
- last_context = (1UL << (pid_bits + 1)) - 1;
- } else
-#endif
- {
+ } else {
first_context = 1;
last_context = 255;
}
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index 83eb5d5f53d..9615d82919b 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -148,6 +148,8 @@ extern unsigned long calc_cam_sz(unsigned long ram, unsigned long virt,
extern void MMU_init_hw(void);
extern unsigned long mmu_mapin_ram(unsigned long top);
extern void adjust_total_lowmem(void);
+extern int switch_to_as1(void);
+extern void restore_to_as0(int esel, int offset, void *dt_ptr, int bootcpu);
#endif
extern void loadcam_entry(unsigned int index);
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 3feefc3842a..3b181b22cd4 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -22,13 +22,22 @@
#include <linux/pfn.h>
#include <linux/cpuset.h>
#include <linux/node.h>
+#include <linux/stop_machine.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <asm/cputhreads.h>
#include <asm/sparsemem.h>
#include <asm/prom.h>
-#include <asm/system.h>
#include <asm/smp.h>
+#include <asm/cputhreads.h>
+#include <asm/topology.h>
#include <asm/firmware.h>
#include <asm/paca.h>
#include <asm/hvcall.h>
+#include <asm/setup.h>
+#include <asm/vdso.h>
static int numa_enabled = 1;
@@ -51,7 +60,7 @@ static int form1_affinity;
#define MAX_DISTANCE_REF_POINTS 4
static int distance_ref_points_depth;
-static const unsigned int *distance_ref_points;
+static const __be32 *distance_ref_points;
static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
/*
@@ -62,14 +71,11 @@ static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
*/
static void __init setup_node_to_cpumask_map(void)
{
- unsigned int node, num = 0;
+ unsigned int node;
/* setup nr_node_ids if not done yet */
- if (nr_node_ids == MAX_NUMNODES) {
- for_each_node_mask(node, node_possible_map)
- num = node;
- nr_node_ids = num + 1;
- }
+ if (nr_node_ids == MAX_NUMNODES)
+ setup_nr_node_ids();
/* allocate the map */
for (node = 0; node < nr_node_ids; node++)
@@ -79,7 +85,7 @@ static void __init setup_node_to_cpumask_map(void)
dbg("Node to cpumask map for %d nodes\n", nr_node_ids);
}
-static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn,
+static int __init fake_numa_create_new_node(unsigned long end_pfn,
unsigned int *nid)
{
unsigned long long mem;
@@ -148,9 +154,22 @@ static void __init get_node_active_region(unsigned long pfn,
}
}
-static void map_cpu_to_node(int cpu, int node)
+static void reset_numa_cpu_lookup_table(void)
+{
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu)
+ numa_cpu_lookup_table[cpu] = -1;
+}
+
+static void update_numa_cpu_lookup_table(unsigned int cpu, int node)
{
numa_cpu_lookup_table[cpu] = node;
+}
+
+static void map_cpu_to_node(int cpu, int node)
+{
+ update_numa_cpu_lookup_table(cpu, node);
dbg("adding cpu %d to node %d\n", cpu, node);
@@ -175,7 +194,7 @@ static void unmap_cpu_from_node(unsigned long cpu)
#endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
/* must hold reference to node during call */
-static const int *of_get_associativity(struct device_node *dev)
+static const __be32 *of_get_associativity(struct device_node *dev)
{
return of_get_property(dev, "ibm,associativity", NULL);
}
@@ -185,13 +204,13 @@ static const int *of_get_associativity(struct device_node *dev)
* it exists (the property exists only in kexec/kdump kernels,
* added by kexec-tools)
*/
-static const u32 *of_get_usable_memory(struct device_node *memory)
+static const __be32 *of_get_usable_memory(struct device_node *memory)
{
- const u32 *prop;
+ const __be32 *prop;
u32 len;
prop = of_get_property(memory, "linux,drconf-usable-memory", &len);
if (!prop || len < sizeof(unsigned int))
- return 0;
+ return NULL;
return prop;
}
@@ -201,7 +220,7 @@ int __node_distance(int a, int b)
int distance = LOCAL_DISTANCE;
if (!form1_affinity)
- return distance;
+ return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
for (i = 0; i < distance_ref_points_depth; i++) {
if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
@@ -213,9 +232,10 @@ int __node_distance(int a, int b)
return distance;
}
+EXPORT_SYMBOL(__node_distance);
static void initialize_distance_lookup_table(int nid,
- const unsigned int *associativity)
+ const __be32 *associativity)
{
int i;
@@ -223,29 +243,32 @@ static void initialize_distance_lookup_table(int nid,
return;
for (i = 0; i < distance_ref_points_depth; i++) {
- distance_lookup_table[nid][i] =
- associativity[distance_ref_points[i]];
+ const __be32 *entry;
+
+ entry = &associativity[be32_to_cpu(distance_ref_points[i])];
+ distance_lookup_table[nid][i] = of_read_number(entry, 1);
}
}
/* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
* info is found.
*/
-static int associativity_to_nid(const unsigned int *associativity)
+static int associativity_to_nid(const __be32 *associativity)
{
int nid = -1;
if (min_common_depth == -1)
goto out;
- if (associativity[0] >= min_common_depth)
- nid = associativity[min_common_depth];
+ if (of_read_number(associativity, 1) >= min_common_depth)
+ nid = of_read_number(&associativity[min_common_depth], 1);
/* POWER4 LPAR uses 0xffff as invalid node */
if (nid == 0xffff || nid >= MAX_NUMNODES)
nid = -1;
- if (nid > 0 && associativity[0] >= distance_ref_points_depth)
+ if (nid > 0 &&
+ of_read_number(associativity, 1) >= distance_ref_points_depth)
initialize_distance_lookup_table(nid, associativity);
out:
@@ -258,7 +281,7 @@ out:
static int of_node_to_nid_single(struct device_node *device)
{
int nid = -1;
- const unsigned int *tmp;
+ const __be32 *tmp;
tmp = of_get_associativity(device);
if (tmp)
@@ -291,9 +314,7 @@ EXPORT_SYMBOL_GPL(of_node_to_nid);
static int __init find_min_common_depth(void)
{
int depth;
- struct device_node *chosen;
struct device_node *root;
- const char *vec5;
if (firmware_has_feature(FW_FEATURE_OPAL))
root = of_find_node_by_path("/ibm,opal");
@@ -325,26 +346,14 @@ static int __init find_min_common_depth(void)
distance_ref_points_depth /= sizeof(int);
-#define VEC5_AFFINITY_BYTE 5
-#define VEC5_AFFINITY 0x80
-
- if (firmware_has_feature(FW_FEATURE_OPAL))
+ if (firmware_has_feature(FW_FEATURE_OPAL) ||
+ firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
+ dbg("Using form 1 affinity\n");
form1_affinity = 1;
- else {
- chosen = of_find_node_by_path("/chosen");
- if (chosen) {
- vec5 = of_get_property(chosen,
- "ibm,architecture-vec-5", NULL);
- if (vec5 && (vec5[VEC5_AFFINITY_BYTE] &
- VEC5_AFFINITY)) {
- dbg("Using form 1 affinity\n");
- form1_affinity = 1;
- }
- }
}
if (form1_affinity) {
- depth = distance_ref_points[0];
+ depth = of_read_number(distance_ref_points, 1);
} else {
if (distance_ref_points_depth < 2) {
printk(KERN_WARNING "NUMA: "
@@ -352,7 +361,7 @@ static int __init find_min_common_depth(void)
goto err;
}
- depth = distance_ref_points[1];
+ depth = of_read_number(&distance_ref_points[1], 1);
}
/*
@@ -386,44 +395,32 @@ static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
of_node_put(memory);
}
-static unsigned long read_n_cells(int n, const unsigned int **buf)
+static unsigned long read_n_cells(int n, const __be32 **buf)
{
unsigned long result = 0;
while (n--) {
- result = (result << 32) | **buf;
+ result = (result << 32) | of_read_number(*buf, 1);
(*buf)++;
}
return result;
}
-struct of_drconf_cell {
- u64 base_addr;
- u32 drc_index;
- u32 reserved;
- u32 aa_index;
- u32 flags;
-};
-
-#define DRCONF_MEM_ASSIGNED 0x00000008
-#define DRCONF_MEM_AI_INVALID 0x00000040
-#define DRCONF_MEM_RESERVED 0x00000080
-
/*
* Read the next memblock list entry from the ibm,dynamic-memory property
* and return the information in the provided of_drconf_cell structure.
*/
-static void read_drconf_cell(struct of_drconf_cell *drmem, const u32 **cellp)
+static void read_drconf_cell(struct of_drconf_cell *drmem, const __be32 **cellp)
{
- const u32 *cp;
+ const __be32 *cp;
drmem->base_addr = read_n_cells(n_mem_addr_cells, cellp);
cp = *cellp;
- drmem->drc_index = cp[0];
- drmem->reserved = cp[1];
- drmem->aa_index = cp[2];
- drmem->flags = cp[3];
+ drmem->drc_index = of_read_number(cp, 1);
+ drmem->reserved = of_read_number(&cp[1], 1);
+ drmem->aa_index = of_read_number(&cp[2], 1);
+ drmem->flags = of_read_number(&cp[3], 1);
*cellp = cp + 4;
}
@@ -435,16 +432,16 @@ static void read_drconf_cell(struct of_drconf_cell *drmem, const u32 **cellp)
* list entries followed by N memblock list entries. Each memblock list entry
* contains information as laid out in the of_drconf_cell struct above.
*/
-static int of_get_drconf_memory(struct device_node *memory, const u32 **dm)
+static int of_get_drconf_memory(struct device_node *memory, const __be32 **dm)
{
- const u32 *prop;
+ const __be32 *prop;
u32 len, entries;
prop = of_get_property(memory, "ibm,dynamic-memory", &len);
if (!prop || len < sizeof(unsigned int))
return 0;
- entries = *prop++;
+ entries = of_read_number(prop++, 1);
/* Now that we know the number of entries, revalidate the size
* of the property read in to ensure we have everything
@@ -462,7 +459,7 @@ static int of_get_drconf_memory(struct device_node *memory, const u32 **dm)
*/
static u64 of_get_lmb_size(struct device_node *memory)
{
- const u32 *prop;
+ const __be32 *prop;
u32 len;
prop = of_get_property(memory, "ibm,lmb-size", &len);
@@ -475,7 +472,7 @@ static u64 of_get_lmb_size(struct device_node *memory)
struct assoc_arrays {
u32 n_arrays;
u32 array_sz;
- const u32 *arrays;
+ const __be32 *arrays;
};
/*
@@ -491,15 +488,15 @@ struct assoc_arrays {
static int of_get_assoc_arrays(struct device_node *memory,
struct assoc_arrays *aa)
{
- const u32 *prop;
+ const __be32 *prop;
u32 len;
prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
if (!prop || len < 2 * sizeof(unsigned int))
return -1;
- aa->n_arrays = *prop++;
- aa->array_sz = *prop++;
+ aa->n_arrays = of_read_number(prop++, 1);
+ aa->array_sz = of_read_number(prop++, 1);
/* Now that we know the number of arrays and size of each array,
* revalidate the size of the property read in.
@@ -526,7 +523,7 @@ static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
!(drmem->flags & DRCONF_MEM_AI_INVALID) &&
drmem->aa_index < aa->n_arrays) {
index = drmem->aa_index * aa->array_sz + min_common_depth - 1;
- nid = aa->arrays[index];
+ nid = of_read_number(&aa->arrays[index], 1);
if (nid == 0xffff || nid >= MAX_NUMNODES)
nid = default_nid;
@@ -539,13 +536,26 @@ static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
* Figure out to which domain a cpu belongs and stick it there.
* Return the id of the domain used.
*/
-static int __cpuinit numa_setup_cpu(unsigned long lcpu)
+static int numa_setup_cpu(unsigned long lcpu)
{
- int nid = 0;
- struct device_node *cpu = of_get_cpu_node(lcpu, NULL);
+ int nid;
+ struct device_node *cpu;
+
+ /*
+ * If a valid cpu-to-node mapping is already available, use it
+ * directly instead of querying the firmware, since it represents
+ * the most recent mapping notified to us by the platform (eg: VPHN).
+ */
+ if ((nid = numa_cpu_lookup_table[lcpu]) >= 0) {
+ map_cpu_to_node(lcpu, nid);
+ return nid;
+ }
+
+ cpu = of_get_cpu_node(lcpu, NULL);
if (!cpu) {
WARN_ON(1);
+ nid = 0;
goto out;
}
@@ -561,17 +571,38 @@ out:
return nid;
}
-static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
- unsigned long action,
+static void verify_cpu_node_mapping(int cpu, int node)
+{
+ int base, sibling, i;
+
+ /* Verify that all the threads in the core belong to the same node */
+ base = cpu_first_thread_sibling(cpu);
+
+ for (i = 0; i < threads_per_core; i++) {
+ sibling = base + i;
+
+ if (sibling == cpu || cpu_is_offline(sibling))
+ continue;
+
+ if (cpu_to_node(sibling) != node) {
+ WARN(1, "CPU thread siblings %d and %d don't belong"
+ " to the same node!\n", cpu, sibling);
+ break;
+ }
+ }
+}
+
+static int cpu_numa_callback(struct notifier_block *nfb, unsigned long action,
void *hcpu)
{
unsigned long lcpu = (unsigned long)hcpu;
- int ret = NOTIFY_DONE;
+ int ret = NOTIFY_DONE, nid;
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
- numa_setup_cpu(lcpu);
+ nid = numa_setup_cpu(lcpu);
+ verify_cpu_node_mapping((int)lcpu, nid);
ret = NOTIFY_OK;
break;
#ifdef CONFIG_HOTPLUG_CPU
@@ -618,7 +649,7 @@ static unsigned long __init numa_enforce_memory_limit(unsigned long start,
* Reads the counter for a given entry in
* linux,drconf-usable-memory property
*/
-static inline int __init read_usm_ranges(const u32 **usm)
+static inline int __init read_usm_ranges(const __be32 **usm)
{
/*
* For each lmb in ibm,dynamic-memory a corresponding
@@ -635,11 +666,11 @@ static inline int __init read_usm_ranges(const u32 **usm)
*/
static void __init parse_drconf_memory(struct device_node *memory)
{
- const u32 *dm, *usm;
+ const __be32 *uninitialized_var(dm), *usm;
unsigned int n, rc, ranges, is_kexec_kdump = 0;
unsigned long lmb_size, base, size, sz;
int nid;
- struct assoc_arrays aa;
+ struct assoc_arrays aa = { .arrays = NULL };
n = of_get_drconf_memory(memory, &dm);
if (!n)
@@ -690,7 +721,8 @@ static void __init parse_drconf_memory(struct device_node *memory)
node_set_online(nid);
sz = numa_enforce_memory_limit(base, size);
if (sz)
- memblock_set_node(base, sz, nid);
+ memblock_set_node(base, sz,
+ &memblock.memory, nid);
} while (--ranges);
}
}
@@ -744,7 +776,7 @@ static int __init parse_numa_properties(void)
unsigned long size;
int nid;
int ranges;
- const unsigned int *memcell_buf;
+ const __be32 *memcell_buf;
unsigned int len;
memcell_buf = of_get_property(memory,
@@ -780,7 +812,7 @@ new_range:
continue;
}
- memblock_set_node(start, size, nid);
+ memblock_set_node(start, size, &memblock.memory, nid);
if (--ranges)
goto new_range;
@@ -817,7 +849,8 @@ static void __init setup_nonnuma(void)
fake_numa_create_new_node(end_pfn, &nid);
memblock_set_node(PFN_PHYS(start_pfn),
- PFN_PHYS(end_pfn - start_pfn), nid);
+ PFN_PHYS(end_pfn - start_pfn),
+ &memblock.memory, nid);
node_set_online(nid);
}
}
@@ -942,7 +975,7 @@ static void __init *careful_zallocation(int nid, unsigned long size,
return ret;
}
-static struct notifier_block __cpuinitdata ppc64_numa_nb = {
+static struct notifier_block ppc64_numa_nb = {
.notifier_call = cpu_numa_callback,
.priority = 1 /* Must run before sched domains notifier. */
};
@@ -958,8 +991,7 @@ static void __init mark_reserved_regions_for_nid(int nid)
unsigned long start_pfn = physbase >> PAGE_SHIFT;
unsigned long end_pfn = PFN_UP(physbase + size);
struct node_active_region node_ar;
- unsigned long node_end_pfn = node->node_start_pfn +
- node->node_spanned_pages;
+ unsigned long node_end_pfn = pgdat_end_pfn(node);
/*
* Check to make sure that this memblock.reserved area is
@@ -1088,6 +1120,7 @@ void __init do_init_bootmem(void)
*/
setup_node_to_cpumask_map();
+ reset_numa_cpu_lookup_table();
register_cpu_notifier(&ppc64_numa_nb);
cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
(void *)(unsigned long)boot_cpuid);
@@ -1129,7 +1162,7 @@ early_param("numa", early_numa);
static int hot_add_drconf_scn_to_nid(struct device_node *memory,
unsigned long scn_addr)
{
- const u32 *dm;
+ const __be32 *dm;
unsigned int drconf_cell_cnt, rc;
unsigned long lmb_size;
struct assoc_arrays aa;
@@ -1174,7 +1207,7 @@ static int hot_add_drconf_scn_to_nid(struct device_node *memory,
* represented in the device tree as a node (i.e. memory@XXXX) for
* each memblock.
*/
-int hot_add_node_scn_to_nid(unsigned long scn_addr)
+static int hot_add_node_scn_to_nid(unsigned long scn_addr)
{
struct device_node *memory;
int nid = -1;
@@ -1182,7 +1215,7 @@ int hot_add_node_scn_to_nid(unsigned long scn_addr)
for_each_node_by_type(memory, "memory") {
unsigned long start, size;
int ranges;
- const unsigned int *memcell_buf;
+ const __be32 *memcell_buf;
unsigned int len;
memcell_buf = of_get_property(memory, "reg", &len);
@@ -1255,7 +1288,7 @@ static u64 hot_add_drconf_memory_max(void)
struct device_node *memory = NULL;
unsigned int drconf_cell_cnt = 0;
u64 lmb_size = 0;
- const u32 *dm = 0;
+ const __be32 *dm = NULL;
memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
if (memory) {
@@ -1280,10 +1313,18 @@ u64 memory_hotplug_max(void)
/* Virtual Processor Home Node (VPHN) support */
#ifdef CONFIG_PPC_SPLPAR
+struct topology_update_data {
+ struct topology_update_data *next;
+ unsigned int cpu;
+ int old_nid;
+ int new_nid;
+};
+
static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
static cpumask_t cpu_associativity_changes_mask;
static int vphn_enabled;
-static void set_topology_timer(void);
+static int prrn_enabled;
+static void reset_topology_timer(void);
/*
* Store the current values of the associativity change counters in the
@@ -1319,11 +1360,9 @@ static void setup_cpu_associativity_change_counters(void)
*/
static int update_cpu_associativity_changes_mask(void)
{
- int cpu, nr_cpus = 0;
+ int cpu;
cpumask_t *changes = &cpu_associativity_changes_mask;
- cpumask_clear(changes);
-
for_each_possible_cpu(cpu) {
int i, changed = 0;
u8 *counts = vphn_cpu_change_counts[cpu];
@@ -1336,12 +1375,12 @@ static int update_cpu_associativity_changes_mask(void)
}
}
if (changed) {
- cpumask_set_cpu(cpu, changes);
- nr_cpus++;
+ cpumask_or(changes, changes, cpu_sibling_mask(cpu));
+ cpu = cpu_last_thread_sibling(cpu);
}
}
- return nr_cpus;
+ return cpumask_weight(changes);
}
/*
@@ -1354,40 +1393,41 @@ static int update_cpu_associativity_changes_mask(void)
* Convert the associativity domain numbers returned from the hypervisor
* to the sequence they would appear in the ibm,associativity property.
*/
-static int vphn_unpack_associativity(const long *packed, unsigned int *unpacked)
+static int vphn_unpack_associativity(const long *packed, __be32 *unpacked)
{
int i, nr_assoc_doms = 0;
- const u16 *field = (const u16*) packed;
+ const __be16 *field = (const __be16 *) packed;
#define VPHN_FIELD_UNUSED (0xffff)
#define VPHN_FIELD_MSB (0x8000)
#define VPHN_FIELD_MASK (~VPHN_FIELD_MSB)
for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) {
- if (*field == VPHN_FIELD_UNUSED) {
+ if (be16_to_cpup(field) == VPHN_FIELD_UNUSED) {
/* All significant fields processed, and remaining
* fields contain the reserved value of all 1's.
* Just store them.
*/
- unpacked[i] = *((u32*)field);
+ unpacked[i] = *((__be32 *)field);
field += 2;
- } else if (*field & VPHN_FIELD_MSB) {
+ } else if (be16_to_cpup(field) & VPHN_FIELD_MSB) {
/* Data is in the lower 15 bits of this field */
- unpacked[i] = *field & VPHN_FIELD_MASK;
+ unpacked[i] = cpu_to_be32(
+ be16_to_cpup(field) & VPHN_FIELD_MASK);
field++;
nr_assoc_doms++;
} else {
/* Data is in the lower 15 bits of this field
* concatenated with the next 16 bit field
*/
- unpacked[i] = *((u32*)field);
+ unpacked[i] = *((__be32 *)field);
field += 2;
nr_assoc_doms++;
}
}
/* The first cell contains the length of the property */
- unpacked[0] = nr_assoc_doms;
+ unpacked[0] = cpu_to_be32(nr_assoc_doms);
return nr_assoc_doms;
}
@@ -1396,7 +1436,7 @@ static int vphn_unpack_associativity(const long *packed, unsigned int *unpacked)
* Retrieve the new associativity information for a virtual processor's
* home node.
*/
-static long hcall_vphn(unsigned long cpu, unsigned int *associativity)
+static long hcall_vphn(unsigned long cpu, __be32 *associativity)
{
long rc;
long retbuf[PLPAR_HCALL9_BUFSIZE] = {0};
@@ -1410,7 +1450,7 @@ static long hcall_vphn(unsigned long cpu, unsigned int *associativity)
}
static long vphn_get_associativity(unsigned long cpu,
- unsigned int *associativity)
+ __be32 *associativity)
{
long rc;
@@ -1433,40 +1473,163 @@ static long vphn_get_associativity(unsigned long cpu,
}
/*
+ * Update the CPU maps and sysfs entries for a single CPU when its NUMA
+ * characteristics change. This function doesn't perform any locking and is
+ * only safe to call from stop_machine().
+ */
+static int update_cpu_topology(void *data)
+{
+ struct topology_update_data *update;
+ unsigned long cpu;
+
+ if (!data)
+ return -EINVAL;
+
+ cpu = smp_processor_id();
+
+ for (update = data; update; update = update->next) {
+ if (cpu != update->cpu)
+ continue;
+
+ unmap_cpu_from_node(update->cpu);
+ map_cpu_to_node(update->cpu, update->new_nid);
+ vdso_getcpu_init();
+ }
+
+ return 0;
+}
+
+static int update_lookup_table(void *data)
+{
+ struct topology_update_data *update;
+
+ if (!data)
+ return -EINVAL;
+
+ /*
+ * Upon topology update, the numa-cpu lookup table needs to be updated
+ * for all threads in the core, including offline CPUs, to ensure that
+ * future hotplug operations respect the cpu-to-node associativity
+ * properly.
+ */
+ for (update = data; update; update = update->next) {
+ int nid, base, j;
+
+ nid = update->new_nid;
+ base = cpu_first_thread_sibling(update->cpu);
+
+ for (j = 0; j < threads_per_core; j++) {
+ update_numa_cpu_lookup_table(base + j, nid);
+ }
+ }
+
+ return 0;
+}
+
+/*
* Update the node maps and sysfs entries for each cpu whose home node
- * has changed.
+ * has changed. Returns 1 when the topology has changed, and 0 otherwise.
*/
int arch_update_cpu_topology(void)
{
- int cpu, nid, old_nid;
- unsigned int associativity[VPHN_ASSOC_BUFSIZE] = {0};
+ unsigned int cpu, sibling, changed = 0;
+ struct topology_update_data *updates, *ud;
+ __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
+ cpumask_t updated_cpus;
struct device *dev;
+ int weight, new_nid, i = 0;
- for_each_cpu(cpu,&cpu_associativity_changes_mask) {
- vphn_get_associativity(cpu, associativity);
- nid = associativity_to_nid(associativity);
+ weight = cpumask_weight(&cpu_associativity_changes_mask);
+ if (!weight)
+ return 0;
- if (nid < 0 || !node_online(nid))
- nid = first_online_node;
+ updates = kzalloc(weight * (sizeof(*updates)), GFP_KERNEL);
+ if (!updates)
+ return 0;
- old_nid = numa_cpu_lookup_table[cpu];
+ cpumask_clear(&updated_cpus);
- /* Disable hotplug while we update the cpu
- * masks and sysfs.
+ for_each_cpu(cpu, &cpu_associativity_changes_mask) {
+ /*
+ * If siblings aren't flagged for changes, updates list
+ * will be too short. Skip on this update and set for next
+ * update.
*/
- get_online_cpus();
- unregister_cpu_under_node(cpu, old_nid);
- unmap_cpu_from_node(cpu);
- map_cpu_to_node(cpu, nid);
- register_cpu_under_node(cpu, nid);
- put_online_cpus();
-
- dev = get_cpu_device(cpu);
+ if (!cpumask_subset(cpu_sibling_mask(cpu),
+ &cpu_associativity_changes_mask)) {
+ pr_info("Sibling bits not set for associativity "
+ "change, cpu%d\n", cpu);
+ cpumask_or(&cpu_associativity_changes_mask,
+ &cpu_associativity_changes_mask,
+ cpu_sibling_mask(cpu));
+ cpu = cpu_last_thread_sibling(cpu);
+ continue;
+ }
+
+ /* Use associativity from first thread for all siblings */
+ vphn_get_associativity(cpu, associativity);
+ new_nid = associativity_to_nid(associativity);
+ if (new_nid < 0 || !node_online(new_nid))
+ new_nid = first_online_node;
+
+ if (new_nid == numa_cpu_lookup_table[cpu]) {
+ cpumask_andnot(&cpu_associativity_changes_mask,
+ &cpu_associativity_changes_mask,
+ cpu_sibling_mask(cpu));
+ cpu = cpu_last_thread_sibling(cpu);
+ continue;
+ }
+
+ for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
+ ud = &updates[i++];
+ ud->cpu = sibling;
+ ud->new_nid = new_nid;
+ ud->old_nid = numa_cpu_lookup_table[sibling];
+ cpumask_set_cpu(sibling, &updated_cpus);
+ if (i < weight)
+ ud->next = &updates[i];
+ }
+ cpu = cpu_last_thread_sibling(cpu);
+ }
+
+ /*
+ * In cases where we have nothing to update (because the updates list
+ * is too short or because the new topology is same as the old one),
+ * skip invoking update_cpu_topology() via stop-machine(). This is
+ * necessary (and not just a fast-path optimization) since stop-machine
+ * can end up electing a random CPU to run update_cpu_topology(), and
+ * thus trick us into setting up incorrect cpu-node mappings (since
+ * 'updates' is kzalloc()'ed).
+ *
+ * And for the similar reason, we will skip all the following updating.
+ */
+ if (!cpumask_weight(&updated_cpus))
+ goto out;
+
+ stop_machine(update_cpu_topology, &updates[0], &updated_cpus);
+
+ /*
+ * Update the numa-cpu lookup table with the new mappings, even for
+ * offline CPUs. It is best to perform this update from the stop-
+ * machine context.
+ */
+ stop_machine(update_lookup_table, &updates[0],
+ cpumask_of(raw_smp_processor_id()));
+
+ for (ud = &updates[0]; ud; ud = ud->next) {
+ unregister_cpu_under_node(ud->cpu, ud->old_nid);
+ register_cpu_under_node(ud->cpu, ud->new_nid);
+
+ dev = get_cpu_device(ud->cpu);
if (dev)
kobject_uevent(&dev->kobj, KOBJ_CHANGE);
+ cpumask_clear_cpu(ud->cpu, &cpu_associativity_changes_mask);
+ changed = 1;
}
- return 1;
+out:
+ kfree(updates);
+ return changed;
}
static void topology_work_fn(struct work_struct *work)
@@ -1475,56 +1638,172 @@ static void topology_work_fn(struct work_struct *work)
}
static DECLARE_WORK(topology_work, topology_work_fn);
-void topology_schedule_update(void)
+static void topology_schedule_update(void)
{
schedule_work(&topology_work);
}
static void topology_timer_fn(unsigned long ignored)
{
- if (!vphn_enabled)
- return;
- if (update_cpu_associativity_changes_mask() > 0)
+ if (prrn_enabled && cpumask_weight(&cpu_associativity_changes_mask))
topology_schedule_update();
- set_topology_timer();
+ else if (vphn_enabled) {
+ if (update_cpu_associativity_changes_mask() > 0)
+ topology_schedule_update();
+ reset_topology_timer();
+ }
}
static struct timer_list topology_timer =
TIMER_INITIALIZER(topology_timer_fn, 0, 0);
-static void set_topology_timer(void)
+static void reset_topology_timer(void)
{
topology_timer.data = 0;
topology_timer.expires = jiffies + 60 * HZ;
- add_timer(&topology_timer);
+ mod_timer(&topology_timer, topology_timer.expires);
+}
+
+#ifdef CONFIG_SMP
+
+static void stage_topology_update(int core_id)
+{
+ cpumask_or(&cpu_associativity_changes_mask,
+ &cpu_associativity_changes_mask, cpu_sibling_mask(core_id));
+ reset_topology_timer();
}
+static int dt_update_callback(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct of_prop_reconfig *update;
+ int rc = NOTIFY_DONE;
+
+ switch (action) {
+ case OF_RECONFIG_UPDATE_PROPERTY:
+ update = (struct of_prop_reconfig *)data;
+ if (!of_prop_cmp(update->dn->type, "cpu") &&
+ !of_prop_cmp(update->prop->name, "ibm,associativity")) {
+ u32 core_id;
+ of_property_read_u32(update->dn, "reg", &core_id);
+ stage_topology_update(core_id);
+ rc = NOTIFY_OK;
+ }
+ break;
+ }
+
+ return rc;
+}
+
+static struct notifier_block dt_update_nb = {
+ .notifier_call = dt_update_callback,
+};
+
+#endif
+
/*
- * Start polling for VPHN associativity changes.
+ * Start polling for associativity changes.
*/
int start_topology_update(void)
{
int rc = 0;
- /* Disabled until races with load balancing are fixed */
- if (0 && firmware_has_feature(FW_FEATURE_VPHN) &&
- get_lppaca()->shared_proc) {
- vphn_enabled = 1;
- setup_cpu_associativity_change_counters();
- init_timer_deferrable(&topology_timer);
- set_topology_timer();
- rc = 1;
+ if (firmware_has_feature(FW_FEATURE_PRRN)) {
+ if (!prrn_enabled) {
+ prrn_enabled = 1;
+ vphn_enabled = 0;
+#ifdef CONFIG_SMP
+ rc = of_reconfig_notifier_register(&dt_update_nb);
+#endif
+ }
+ } else if (firmware_has_feature(FW_FEATURE_VPHN) &&
+ lppaca_shared_proc(get_lppaca())) {
+ if (!vphn_enabled) {
+ prrn_enabled = 0;
+ vphn_enabled = 1;
+ setup_cpu_associativity_change_counters();
+ init_timer_deferrable(&topology_timer);
+ reset_topology_timer();
+ }
}
return rc;
}
-__initcall(start_topology_update);
/*
* Disable polling for VPHN associativity changes.
*/
int stop_topology_update(void)
{
- vphn_enabled = 0;
- return del_timer_sync(&topology_timer);
+ int rc = 0;
+
+ if (prrn_enabled) {
+ prrn_enabled = 0;
+#ifdef CONFIG_SMP
+ rc = of_reconfig_notifier_unregister(&dt_update_nb);
+#endif
+ } else if (vphn_enabled) {
+ vphn_enabled = 0;
+ rc = del_timer_sync(&topology_timer);
+ }
+
+ return rc;
+}
+
+int prrn_is_enabled(void)
+{
+ return prrn_enabled;
+}
+
+static int topology_read(struct seq_file *file, void *v)
+{
+ if (vphn_enabled || prrn_enabled)
+ seq_puts(file, "on\n");
+ else
+ seq_puts(file, "off\n");
+
+ return 0;
+}
+
+static int topology_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, topology_read, NULL);
+}
+
+static ssize_t topology_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *off)
+{
+ char kbuf[4]; /* "on" or "off" plus null. */
+ int read_len;
+
+ read_len = count < 3 ? count : 3;
+ if (copy_from_user(kbuf, buf, read_len))
+ return -EINVAL;
+
+ kbuf[read_len] = '\0';
+
+ if (!strncmp(kbuf, "on", 2))
+ start_topology_update();
+ else if (!strncmp(kbuf, "off", 3))
+ stop_topology_update();
+ else
+ return -EINVAL;
+
+ return count;
+}
+
+static const struct file_operations topology_ops = {
+ .read = seq_read,
+ .write = topology_write,
+ .open = topology_open,
+ .release = single_release
+};
+
+static int topology_update_init(void)
+{
+ start_topology_update();
+ proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops);
+
+ return 0;
}
+device_initcall(topology_update_init);
#endif /* CONFIG_PPC_SPLPAR */
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 214130a4edc..c695943a513 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -24,7 +24,6 @@
#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/mm.h>
-#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/hugetlb.h>
@@ -32,8 +31,6 @@
#include <asm/tlbflush.h>
#include <asm/tlb.h>
-#include "mmu_decl.h"
-
static inline int is_exec_fault(void)
{
return current->thread.regs && TRAP(current->thread.regs) == 0x400;
@@ -72,7 +69,7 @@ struct page * maybe_pte_to_page(pte_t pte)
* support falls into the same category.
*/
-static pte_t set_pte_filter(pte_t pte, unsigned long addr)
+static pte_t set_pte_filter(pte_t pte)
{
pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) ||
@@ -81,17 +78,6 @@ static pte_t set_pte_filter(pte_t pte, unsigned long addr)
if (!pg)
return pte;
if (!test_bit(PG_arch_1, &pg->flags)) {
-#ifdef CONFIG_8xx
- /* On 8xx, cache control instructions (particularly
- * "dcbst" from flush_dcache_icache) fault as write
- * operation if there is an unpopulated TLB entry
- * for the address in question. To workaround that,
- * we invalidate the TLB here, thus avoiding dcbst
- * misbehaviour.
- */
- /* 8xx doesn't care about PID, size or ind args */
- _tlbil_va(addr, 0, 0, 0);
-#endif /* CONFIG_8xx */
flush_dcache_icache_page(pg);
set_bit(PG_arch_1, &pg->flags);
}
@@ -111,7 +97,7 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
* as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC so
* instead we "filter out" the exec permission for non clean pages.
*/
-static pte_t set_pte_filter(pte_t pte, unsigned long addr)
+static pte_t set_pte_filter(pte_t pte)
{
struct page *pg;
@@ -187,13 +173,13 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
pte_t pte)
{
#ifdef CONFIG_DEBUG_VM
- WARN_ON(pte_present(*ptep));
+ WARN_ON(pte_val(*ptep) & _PAGE_PRESENT);
#endif
/* Note: mm->context.id might not yet have been assigned as
* this context might not have been activated yet when this
* is called.
*/
- pte = set_pte_filter(pte, addr);
+ pte = set_pte_filter(pte);
/* Perform the setting of the PTE */
__set_pte_at(mm, addr, ptep, pte, 0);
@@ -235,6 +221,14 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
pud = pud_offset(pgd, addr);
BUG_ON(pud_none(*pud));
pmd = pmd_offset(pud, addr);
+ /*
+ * khugepaged to collapse normal pages to hugepage, first set
+ * pmd to none to force page fault/gup to take mmap_sem. After
+ * pmd is set to none, we do a pte_clear which does this assertion
+ * so if we find pmd none, return.
+ */
+ if (pmd_none(*pmd))
+ return;
BUG_ON(!pmd_present(*pmd));
assert_spin_locked(pte_lockptr(mm, pmd));
}
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 51f87956f8f..343a87fa78b 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -33,6 +33,7 @@
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
#include <asm/io.h>
+#include <asm/setup.h>
#include "mmu_decl.h"
@@ -120,7 +121,10 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
ptepage = alloc_pages(flags, 0);
if (!ptepage)
return NULL;
- pgtable_page_ctor(ptepage);
+ if (!pgtable_page_ctor(ptepage)) {
+ __free_page(ptepage);
+ return NULL;
+ }
return ptepage;
}
@@ -207,7 +211,7 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, unsigned long flags,
*/
if (mem_init_done && (p < virt_to_phys(high_memory)) &&
!(__allow_ioremap_reserved && memblock_is_region_reserved(p, size))) {
- printk("__ioremap(): phys addr 0x%llx is RAM lr %p\n",
+ printk("__ioremap(): phys addr 0x%llx is RAM lr %pf\n",
(unsigned long long)p, __builtin_return_address(0));
return NULL;
}
@@ -295,6 +299,7 @@ int map_page(unsigned long va, phys_addr_t pa, int flags)
set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT,
__pgprot(flags)));
}
+ smp_wmb();
return err;
}
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index ad36ede469c..f6ce1f111f5 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -33,7 +33,6 @@
#include <linux/swap.h>
#include <linux/stddef.h>
#include <linux/vmalloc.h>
-#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/slab.h>
@@ -51,14 +50,22 @@
#include <asm/processor.h>
#include <asm/cputable.h>
#include <asm/sections.h>
-#include <asm/system.h>
-#include <asm/abs_addr.h>
#include <asm/firmware.h>
#include "mmu_decl.h"
-unsigned long ioremap_bot = IOREMAP_BASE;
+/* Some sanity checking */
+#if TASK_SIZE_USER64 > PGTABLE_RANGE
+#error TASK_SIZE_USER64 exceeds pagetable range
+#endif
+#ifdef CONFIG_PPC_STD_MMU_64
+#if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT))
+#error TASK_SIZE_USER64 exceeds user VSID range
+#endif
+#endif
+
+unsigned long ioremap_bot = IOREMAP_BASE;
#ifdef CONFIG_PPC_MMU_NOHASH
static void *early_alloc_pgtable(unsigned long size)
@@ -145,6 +152,18 @@ int map_kernel_page(unsigned long ea, unsigned long pa, int flags)
}
#endif /* !CONFIG_PPC_MMU_NOHASH */
}
+
+#ifdef CONFIG_PPC_BOOK3E_64
+ /*
+ * With hardware tablewalk, a sync is needed to ensure that
+ * subsequent accesses see the PTE we just wrote. Unlike userspace
+ * mappings, we can't tolerate spurious faults, so make sure
+ * the new PTE will be seen the first time.
+ */
+ mb();
+#else
+ smp_wmb();
+#endif
return 0;
}
@@ -329,3 +348,543 @@ EXPORT_SYMBOL(__ioremap_at);
EXPORT_SYMBOL(iounmap);
EXPORT_SYMBOL(__iounmap);
EXPORT_SYMBOL(__iounmap_at);
+
+/*
+ * For hugepage we have pfn in the pmd, we use PTE_RPN_SHIFT bits for flags
+ * For PTE page, we have a PTE_FRAG_SIZE (4K) aligned virtual address.
+ */
+struct page *pmd_page(pmd_t pmd)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (pmd_trans_huge(pmd))
+ return pfn_to_page(pmd_pfn(pmd));
+#endif
+ return virt_to_page(pmd_page_vaddr(pmd));
+}
+
+#ifdef CONFIG_PPC_64K_PAGES
+static pte_t *get_from_cache(struct mm_struct *mm)
+{
+ void *pte_frag, *ret;
+
+ spin_lock(&mm->page_table_lock);
+ ret = mm->context.pte_frag;
+ if (ret) {
+ pte_frag = ret + PTE_FRAG_SIZE;
+ /*
+ * If we have taken up all the fragments mark PTE page NULL
+ */
+ if (((unsigned long)pte_frag & ~PAGE_MASK) == 0)
+ pte_frag = NULL;
+ mm->context.pte_frag = pte_frag;
+ }
+ spin_unlock(&mm->page_table_lock);
+ return (pte_t *)ret;
+}
+
+static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel)
+{
+ void *ret = NULL;
+ struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
+ __GFP_REPEAT | __GFP_ZERO);
+ if (!page)
+ return NULL;
+ if (!kernel && !pgtable_page_ctor(page)) {
+ __free_page(page);
+ return NULL;
+ }
+
+ ret = page_address(page);
+ spin_lock(&mm->page_table_lock);
+ /*
+ * If we find pgtable_page set, we return
+ * the allocated page with single fragement
+ * count.
+ */
+ if (likely(!mm->context.pte_frag)) {
+ atomic_set(&page->_count, PTE_FRAG_NR);
+ mm->context.pte_frag = ret + PTE_FRAG_SIZE;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ return (pte_t *)ret;
+}
+
+pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel)
+{
+ pte_t *pte;
+
+ pte = get_from_cache(mm);
+ if (pte)
+ return pte;
+
+ return __alloc_for_cache(mm, kernel);
+}
+
+void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel)
+{
+ struct page *page = virt_to_page(table);
+ if (put_page_testzero(page)) {
+ if (!kernel)
+ pgtable_page_dtor(page);
+ free_hot_cold_page(page, 0);
+ }
+}
+
+#ifdef CONFIG_SMP
+static void page_table_free_rcu(void *table)
+{
+ struct page *page = virt_to_page(table);
+ if (put_page_testzero(page)) {
+ pgtable_page_dtor(page);
+ free_hot_cold_page(page, 0);
+ }
+}
+
+void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
+{
+ unsigned long pgf = (unsigned long)table;
+
+ BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
+ pgf |= shift;
+ tlb_remove_table(tlb, (void *)pgf);
+}
+
+void __tlb_remove_table(void *_table)
+{
+ void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
+ unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
+
+ if (!shift)
+ /* PTE page needs special handling */
+ page_table_free_rcu(table);
+ else {
+ BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
+ kmem_cache_free(PGT_CACHE(shift), table);
+ }
+}
+#else
+void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
+{
+ if (!shift) {
+ /* PTE page needs special handling */
+ struct page *page = virt_to_page(table);
+ if (put_page_testzero(page)) {
+ pgtable_page_dtor(page);
+ free_hot_cold_page(page, 0);
+ }
+ } else {
+ BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
+ kmem_cache_free(PGT_CACHE(shift), table);
+ }
+}
+#endif
+#endif /* CONFIG_PPC_64K_PAGES */
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+/*
+ * This is called when relaxing access to a hugepage. It's also called in the page
+ * fault path when we don't hit any of the major fault cases, ie, a minor
+ * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have
+ * handled those two for us, we additionally deal with missing execute
+ * permission here on some processors
+ */
+int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp, pmd_t entry, int dirty)
+{
+ int changed;
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(!pmd_trans_huge(*pmdp));
+ assert_spin_locked(&vma->vm_mm->page_table_lock);
+#endif
+ changed = !pmd_same(*(pmdp), entry);
+ if (changed) {
+ __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry));
+ /*
+ * Since we are not supporting SW TLB systems, we don't
+ * have any thing similar to flush_tlb_page_nohash()
+ */
+ }
+ return changed;
+}
+
+unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, unsigned long clr,
+ unsigned long set)
+{
+
+ unsigned long old, tmp;
+
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(!pmd_trans_huge(*pmdp));
+ assert_spin_locked(&mm->page_table_lock);
+#endif
+
+#ifdef PTE_ATOMIC_UPDATES
+ __asm__ __volatile__(
+ "1: ldarx %0,0,%3\n\
+ andi. %1,%0,%6\n\
+ bne- 1b \n\
+ andc %1,%0,%4 \n\
+ or %1,%1,%7\n\
+ stdcx. %1,0,%3 \n\
+ bne- 1b"
+ : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
+ : "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY), "r" (set)
+ : "cc" );
+#else
+ old = pmd_val(*pmdp);
+ *pmdp = __pmd((old & ~clr) | set);
+#endif
+ if (old & _PAGE_HASHPTE)
+ hpte_do_hugepage_flush(mm, addr, pmdp);
+ return old;
+}
+
+pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ pmd_t pmd;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ if (pmd_trans_huge(*pmdp)) {
+ pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
+ } else {
+ /*
+ * khugepaged calls this for normal pmd
+ */
+ pmd = *pmdp;
+ pmd_clear(pmdp);
+ /*
+ * Wait for all pending hash_page to finish. This is needed
+ * in case of subpage collapse. When we collapse normal pages
+ * to hugepage, we first clear the pmd, then invalidate all
+ * the PTE entries. The assumption here is that any low level
+ * page fault will see a none pmd and take the slow path that
+ * will wait on mmap_sem. But we could very well be in a
+ * hash_page with local ptep pointer value. Such a hash page
+ * can result in adding new HPTE entries for normal subpages.
+ * That means we could be modifying the page content as we
+ * copy them to a huge page. So wait for parallel hash_page
+ * to finish before invalidating HPTE entries. We can do this
+ * by sending an IPI to all the cpus and executing a dummy
+ * function there.
+ */
+ kick_all_cpus_sync();
+ /*
+ * Now invalidate the hpte entries in the range
+ * covered by pmd. This make sure we take a
+ * fault and will find the pmd as none, which will
+ * result in a major fault which takes mmap_sem and
+ * hence wait for collapse to complete. Without this
+ * the __collapse_huge_page_copy can result in copying
+ * the old content.
+ */
+ flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
+ }
+ return pmd;
+}
+
+int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
+}
+
+/*
+ * We currently remove entries from the hashtable regardless of whether
+ * the entry was young or dirty. The generic routines only flush if the
+ * entry was young or dirty which is not good enough.
+ *
+ * We should be more intelligent about this but for the moment we override
+ * these functions and force a tlb flush unconditionally
+ */
+int pmdp_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
+}
+
+/*
+ * We mark the pmd splitting and invalidate all the hpte
+ * entries for this hugepage.
+ */
+void pmdp_splitting_flush(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ unsigned long old, tmp;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(!pmd_trans_huge(*pmdp));
+ assert_spin_locked(&vma->vm_mm->page_table_lock);
+#endif
+
+#ifdef PTE_ATOMIC_UPDATES
+
+ __asm__ __volatile__(
+ "1: ldarx %0,0,%3\n\
+ andi. %1,%0,%6\n\
+ bne- 1b \n\
+ ori %1,%0,%4 \n\
+ stdcx. %1,0,%3 \n\
+ bne- 1b"
+ : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
+ : "r" (pmdp), "i" (_PAGE_SPLITTING), "m" (*pmdp), "i" (_PAGE_BUSY)
+ : "cc" );
+#else
+ old = pmd_val(*pmdp);
+ *pmdp = __pmd(old | _PAGE_SPLITTING);
+#endif
+ /*
+ * If we didn't had the splitting flag set, go and flush the
+ * HPTE entries.
+ */
+ if (!(old & _PAGE_SPLITTING)) {
+ /* We need to flush the hpte */
+ if (old & _PAGE_HASHPTE)
+ hpte_do_hugepage_flush(vma->vm_mm, address, pmdp);
+ }
+ /*
+ * This ensures that generic code that rely on IRQ disabling
+ * to prevent a parallel THP split work as expected.
+ */
+ kick_all_cpus_sync();
+}
+
+/*
+ * We want to put the pgtable in pmd and use pgtable for tracking
+ * the base page size hptes
+ */
+void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+ pgtable_t pgtable)
+{
+ pgtable_t *pgtable_slot;
+ assert_spin_locked(&mm->page_table_lock);
+ /*
+ * we store the pgtable in the second half of PMD
+ */
+ pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+ *pgtable_slot = pgtable;
+ /*
+ * expose the deposited pgtable to other cpus.
+ * before we set the hugepage PTE at pmd level
+ * hash fault code looks at the deposted pgtable
+ * to store hash index values.
+ */
+ smp_wmb();
+}
+
+pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
+{
+ pgtable_t pgtable;
+ pgtable_t *pgtable_slot;
+
+ assert_spin_locked(&mm->page_table_lock);
+ pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+ pgtable = *pgtable_slot;
+ /*
+ * Once we withdraw, mark the entry NULL.
+ */
+ *pgtable_slot = NULL;
+ /*
+ * We store HPTE information in the deposited PTE fragment.
+ * zero out the content on withdraw.
+ */
+ memset(pgtable, 0, PTE_FRAG_SIZE);
+ return pgtable;
+}
+
+/*
+ * set a new huge pmd. We should not be called for updating
+ * an existing pmd entry. That should go via pmd_hugepage_update.
+ */
+void set_pmd_at(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, pmd_t pmd)
+{
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(pmd_val(*pmdp) & _PAGE_PRESENT);
+ assert_spin_locked(&mm->page_table_lock);
+ WARN_ON(!pmd_trans_huge(pmd));
+#endif
+ return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
+}
+
+void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0);
+}
+
+/*
+ * A linux hugepage PMD was changed and the corresponding hash table entries
+ * neesd to be flushed.
+ */
+void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp)
+{
+ int ssize, i;
+ unsigned long s_addr;
+ int max_hpte_count;
+ unsigned int psize, valid;
+ unsigned char *hpte_slot_array;
+ unsigned long hidx, vpn, vsid, hash, shift, slot;
+
+ /*
+ * Flush all the hptes mapping this hugepage
+ */
+ s_addr = addr & HPAGE_PMD_MASK;
+ hpte_slot_array = get_hpte_slot_array(pmdp);
+ /*
+ * IF we try to do a HUGE PTE update after a withdraw is done.
+ * we will find the below NULL. This happens when we do
+ * split_huge_page_pmd
+ */
+ if (!hpte_slot_array)
+ return;
+
+ /* get the base page size */
+ psize = get_slice_psize(mm, s_addr);
+
+ if (ppc_md.hugepage_invalidate)
+ return ppc_md.hugepage_invalidate(mm, hpte_slot_array,
+ s_addr, psize);
+ /*
+ * No bluk hpte removal support, invalidate each entry
+ */
+ shift = mmu_psize_defs[psize].shift;
+ max_hpte_count = HPAGE_PMD_SIZE >> shift;
+ for (i = 0; i < max_hpte_count; i++) {
+ /*
+ * 8 bits per each hpte entries
+ * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
+ */
+ valid = hpte_valid(hpte_slot_array, i);
+ if (!valid)
+ continue;
+ hidx = hpte_hash_index(hpte_slot_array, i);
+
+ /* get the vpn */
+ addr = s_addr + (i * (1ul << shift));
+ if (!is_kernel_addr(addr)) {
+ ssize = user_segment_size(addr);
+ vsid = get_vsid(mm->context.id, addr, ssize);
+ WARN_ON(vsid == 0);
+ } else {
+ vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+ ssize = mmu_kernel_ssize;
+ }
+
+ vpn = hpt_vpn(addr, vsid, ssize);
+ hash = hpt_hash(vpn, shift, ssize);
+ if (hidx & _PTEIDX_SECONDARY)
+ hash = ~hash;
+
+ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot += hidx & _PTEIDX_GROUP_IX;
+ ppc_md.hpte_invalidate(slot, vpn, psize,
+ MMU_PAGE_16M, ssize, 0);
+ }
+}
+
+static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
+{
+ pmd_val(pmd) |= pgprot_val(pgprot);
+ return pmd;
+}
+
+pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
+{
+ pmd_t pmd;
+ /*
+ * For a valid pte, we would have _PAGE_PRESENT or _PAGE_FILE always
+ * set. We use this to check THP page at pmd level.
+ * leaf pte for huge page, bottom two bits != 00
+ */
+ pmd_val(pmd) = pfn << PTE_RPN_SHIFT;
+ pmd_val(pmd) |= _PAGE_THP_HUGE;
+ pmd = pmd_set_protbits(pmd, pgprot);
+ return pmd;
+}
+
+pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
+{
+ return pfn_pmd(page_to_pfn(page), pgprot);
+}
+
+pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
+{
+
+ pmd_val(pmd) &= _HPAGE_CHG_MASK;
+ pmd = pmd_set_protbits(pmd, newprot);
+ return pmd;
+}
+
+/*
+ * This is called at the end of handling a user page fault, when the
+ * fault has been handled by updating a HUGE PMD entry in the linux page tables.
+ * We use it to preload an HPTE into the hash table corresponding to
+ * the updated linux HUGE PMD entry.
+ */
+void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd)
+{
+ return;
+}
+
+pmd_t pmdp_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp)
+{
+ pmd_t old_pmd;
+ pgtable_t pgtable;
+ unsigned long old;
+ pgtable_t *pgtable_slot;
+
+ old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
+ old_pmd = __pmd(old);
+ /*
+ * We have pmd == none and we are holding page_table_lock.
+ * So we can safely go and clear the pgtable hash
+ * index info.
+ */
+ pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+ pgtable = *pgtable_slot;
+ /*
+ * Let's zero out old valid and hash index details
+ * hash fault look at them.
+ */
+ memset(pgtable, 0, PTE_FRAG_SIZE);
+ return old_pmd;
+}
+
+int has_transparent_hugepage(void)
+{
+ if (!mmu_has_feature(MMU_FTR_16M_PAGE))
+ return 0;
+ /*
+ * We support THP only if PMD_SIZE is 16MB.
+ */
+ if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
+ return 0;
+ /*
+ * We need to make sure that we support 16MB hugepage in a segement
+ * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
+ * of 64K.
+ */
+ /*
+ * If we have 64K HPTE, we will be using that by default
+ */
+ if (mmu_psize_defs[MMU_PAGE_64K].shift &&
+ (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
+ return 0;
+ /*
+ * Ok we only have 4K HPTE
+ */
+ if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
+ return 0;
+
+ return 1;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index e22276cb67a..0399a670295 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -21,7 +21,6 @@
#include <asm/cputable.h>
#include <asm/cacheflush.h>
#include <asm/smp.h>
-#include <asm/firmware.h>
#include <linux/compiler.h>
#include <asm/udbg.h>
#include <asm/code-patching.h>
@@ -67,8 +66,10 @@ static inline void slb_shadow_update(unsigned long ea, int ssize,
* we only update the current CPU's SLB shadow buffer.
*/
get_slb_shadow()->save_area[entry].esid = 0;
- get_slb_shadow()->save_area[entry].vsid = mk_vsid_data(ea, ssize, flags);
- get_slb_shadow()->save_area[entry].esid = mk_esid_data(ea, ssize, entry);
+ get_slb_shadow()->save_area[entry].vsid =
+ cpu_to_be64(mk_vsid_data(ea, ssize, flags));
+ get_slb_shadow()->save_area[entry].esid =
+ cpu_to_be64(mk_esid_data(ea, ssize, entry));
}
static inline void slb_shadow_clear(unsigned long entry)
@@ -96,7 +97,7 @@ static inline void create_shadowed_slbe(unsigned long ea, int ssize,
static void __slb_flush_and_rebolt(void)
{
/* If you change this make sure you change SLB_NUM_BOLTED
- * appropriately too. */
+ * and PR KVM appropriately too. */
unsigned long linear_llp, vmalloc_llp, lflags, vflags;
unsigned long ksp_esid_data, ksp_vsid_data;
@@ -113,7 +114,8 @@ static void __slb_flush_and_rebolt(void)
} else {
/* Update stack entry; others don't change */
slb_shadow_update(get_paca()->kstack, mmu_kernel_ssize, lflags, 2);
- ksp_vsid_data = get_slb_shadow()->save_area[2].vsid;
+ ksp_vsid_data =
+ be64_to_cpu(get_slb_shadow()->save_area[2].vsid);
}
/* We need to do this all in asm, so we're sure we don't touch
@@ -254,10 +256,14 @@ static inline void patch_slb_encoding(unsigned int *insn_addr,
patch_instruction(insn_addr, insn);
}
+extern u32 slb_compare_rr_to_size[];
+extern u32 slb_miss_kernel_load_linear[];
+extern u32 slb_miss_kernel_load_io[];
+extern u32 slb_compare_rr_to_size[];
+extern u32 slb_miss_kernel_load_vmemmap[];
+
void slb_set_size(u16 size)
{
- extern unsigned int *slb_compare_rr_to_size;
-
if (mmu_slb_size == size)
return;
@@ -270,11 +276,7 @@ void slb_initialize(void)
unsigned long linear_llp, vmalloc_llp, io_llp;
unsigned long lflags, vflags;
static int slb_encoding_inited;
- extern unsigned int *slb_miss_kernel_load_linear;
- extern unsigned int *slb_miss_kernel_load_io;
- extern unsigned int *slb_compare_rr_to_size;
#ifdef CONFIG_SPARSEMEM_VMEMMAP
- extern unsigned int *slb_miss_kernel_load_vmemmap;
unsigned long vmemmap_llp;
#endif
@@ -307,11 +309,6 @@ void slb_initialize(void)
get_paca()->stab_rr = SLB_NUM_BOLTED;
- /* On iSeries the bolted entries have already been set up by
- * the hypervisor from the lparMap data in head.S */
- if (firmware_has_feature(FW_FEATURE_ISERIES))
- return;
-
lflags = SLB_VSID_KERNEL | linear_llp;
vflags = SLB_VSID_KERNEL | vmalloc_llp;
diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
index ef653dc95b6..736d18b3cef 100644
--- a/arch/powerpc/mm/slb_low.S
+++ b/arch/powerpc/mm/slb_low.S
@@ -31,10 +31,15 @@
* No other registers are examined or changed.
*/
_GLOBAL(slb_allocate_realmode)
- /* r3 = faulting address */
+ /*
+ * check for bad kernel/user address
+ * (ea & ~REGION_MASK) >= PGTABLE_RANGE
+ */
+ rldicr. r9,r3,4,(63 - PGTABLE_EADDR_SIZE - 4)
+ bne- 8f
srdi r9,r3,60 /* get region */
- srdi r10,r3,28 /* get esid */
+ srdi r10,r3,SID_SHIFT /* get esid */
cmpldi cr7,r9,0xc /* cmp PAGE_OFFSET for later use */
/* r3 = address, r10 = esid, cr7 = <> PAGE_OFFSET */
@@ -54,8 +59,17 @@ _GLOBAL(slb_allocate_realmode)
/* Linear mapping encoding bits, the "li" instruction below will
* be patched by the kernel at boot
*/
-_GLOBAL(slb_miss_kernel_load_linear)
+.globl slb_miss_kernel_load_linear
+slb_miss_kernel_load_linear:
li r11,0
+ /*
+ * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1
+ * r9 = region id.
+ */
+ addis r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha
+ addi r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l
+
+
BEGIN_FTR_SECTION
b slb_finish_load
END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
@@ -66,7 +80,8 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
/* Check virtual memmap region. To be patches at kernel boot */
cmpldi cr0,r9,0xf
bne 1f
-_GLOBAL(slb_miss_kernel_load_vmemmap)
+.globl slb_miss_kernel_load_vmemmap
+slb_miss_kernel_load_vmemmap:
li r11,0
b 6f
1:
@@ -82,21 +97,23 @@ _GLOBAL(slb_miss_kernel_load_vmemmap)
b 6f
5:
/* IO mapping */
- _GLOBAL(slb_miss_kernel_load_io)
+.globl slb_miss_kernel_load_io
+slb_miss_kernel_load_io:
li r11,0
6:
+ /*
+ * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1
+ * r9 = region id.
+ */
+ addis r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha
+ addi r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l
+
BEGIN_FTR_SECTION
b slb_finish_load
END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
b slb_finish_load_1T
-0: /* user address: proto-VSID = context << 15 | ESID. First check
- * if the address is within the boundaries of the user region
- */
- srdi. r9,r10,USER_ESID_BITS
- bne- 8f /* invalid ea bits set */
-
-
+0:
/* when using slices, we extract the psize off the slice bitmaps
* and then we need to get the sllp encoding off the mmu_psize_defs
* array.
@@ -108,17 +125,31 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
* between 4k and 64k standard page size
*/
#ifdef CONFIG_PPC_MM_SLICES
+ /* r10 have esid */
cmpldi r10,16
-
- /* Get the slice index * 4 in r11 and matching slice size mask in r9 */
- ld r9,PACALOWSLICESPSIZE(r13)
- sldi r11,r10,2
+ /* below SLICE_LOW_TOP */
blt 5f
- ld r9,PACAHIGHSLICEPSIZE(r13)
- srdi r11,r10,(SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT - 2)
- andi. r11,r11,0x3c
+ /*
+ * Handle hpsizes,
+ * r9 is get_paca()->context.high_slices_psize[index], r11 is mask_index
+ */
+ srdi r11,r10,(SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT + 1) /* index */
+ addi r9,r11,PACAHIGHSLICEPSIZE
+ lbzx r9,r13,r9 /* r9 is hpsizes[r11] */
+ /* r11 = (r10 >> (SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT)) & 0x1 */
+ rldicl r11,r10,(64 - (SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT)),63
+ b 6f
-5: /* Extract the psize and multiply to get an array offset */
+5:
+ /*
+ * Handle lpsizes
+ * r9 is get_paca()->context.low_slices_psize, r11 is index
+ */
+ ld r9,PACALOWSLICESPSIZE(r13)
+ mr r11,r10
+6:
+ sldi r11,r11,2 /* index * 4 */
+ /* Extract the psize and multiply to get an array offset */
srd r9,r9,r11
andi. r9,r9,0xf
mulli r9,r9,MMUPSIZEDEFSIZE
@@ -138,15 +169,13 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
ld r9,PACACONTEXTID(r13)
BEGIN_FTR_SECTION
cmpldi r10,0x1000
-END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
- rldimi r10,r9,USER_ESID_BITS,0
-BEGIN_FTR_SECTION
bge slb_finish_load_1T
END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
b slb_finish_load
8: /* invalid EA */
li r10,0 /* BAD_VSID */
+ li r9,0 /* BAD_VSID */
li r11,SLB_VSID_USER /* flags don't much matter */
b slb_finish_load
@@ -195,8 +224,6 @@ _GLOBAL(slb_allocate_user)
/* get context to calculate proto-VSID */
ld r9,PACACONTEXTID(r13)
- rldimi r10,r9,USER_ESID_BITS,0
-
/* fall through slb_finish_load */
#endif /* __DISABLED__ */
@@ -205,11 +232,16 @@ _GLOBAL(slb_allocate_user)
/*
* Finish loading of an SLB entry and return
*
- * r3 = EA, r10 = proto-VSID, r11 = flags, clobbers r9, cr7 = <> PAGE_OFFSET
+ * r3 = EA, r9 = context, r10 = ESID, r11 = flags, clobbers r9, cr7 = <> PAGE_OFFSET
*/
slb_finish_load:
+ rldimi r10,r9,ESID_BITS,0
ASM_VSID_SCRAMBLE(r10,r9,256M)
- rldimi r11,r10,SLB_VSID_SHIFT,16 /* combine VSID and flags */
+ /*
+ * bits above VSID_BITS_256M need to be ignored from r10
+ * also combine VSID and flags
+ */
+ rldimi r11,r10,SLB_VSID_SHIFT,(64 - (SLB_VSID_SHIFT + VSID_BITS_256M))
/* r3 = EA, r11 = VSID data */
/*
@@ -217,26 +249,12 @@ slb_finish_load:
* free slot first but that took too long. Unfortunately we
* dont have any LRU information to help us choose a slot.
*/
-#ifdef CONFIG_PPC_ISERIES
-BEGIN_FW_FTR_SECTION
- /*
- * On iSeries, the "bolted" stack segment can be cast out on
- * shared processor switch so we need to check for a miss on
- * it and restore it to the right slot.
- */
- ld r9,PACAKSAVE(r13)
- clrrdi r9,r9,28
- clrrdi r3,r3,28
- li r10,SLB_NUM_BOLTED-1 /* Stack goes in last bolted slot */
- cmpld r9,r3
- beq 3f
-END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES)
-#endif /* CONFIG_PPC_ISERIES */
7: ld r10,PACASTABRR(r13)
addi r10,r10,1
/* This gets soft patched on boot. */
-_GLOBAL(slb_compare_rr_to_size)
+.globl slb_compare_rr_to_size
+slb_compare_rr_to_size:
cmpldi r10,0
blt+ 4f
@@ -267,10 +285,10 @@ _GLOBAL(slb_compare_rr_to_size)
bge 1f
/* still room in the slb cache */
- sldi r11,r3,1 /* r11 = offset * sizeof(u16) */
- rldicl r10,r10,36,28 /* get low 16 bits of the ESID */
- add r11,r11,r13 /* r11 = (u16 *)paca + offset */
- sth r10,PACASLBCACHE(r11) /* paca->slb_cache[offset] = esid */
+ sldi r11,r3,2 /* r11 = offset * sizeof(u32) */
+ srdi r10,r10,28 /* get the 36 bits of the ESID */
+ add r11,r11,r13 /* r11 = (u32 *)paca + offset */
+ stw r10,PACASLBCACHE(r11) /* paca->slb_cache[offset] = esid */
addi r3,r3,1 /* offset++ */
b 2f
1: /* offset >= SLB_CACHE_ENTRIES */
@@ -282,14 +300,18 @@ _GLOBAL(slb_compare_rr_to_size)
/*
* Finish loading of a 1T SLB entry (for the kernel linear mapping) and return.
- * We assume legacy iSeries will never have 1T segments.
*
- * r3 = EA, r10 = proto-VSID, r11 = flags, clobbers r9
+ * r3 = EA, r9 = context, r10 = ESID(256MB), r11 = flags, clobbers r9
*/
slb_finish_load_1T:
- srdi r10,r10,40-28 /* get 1T ESID */
+ srdi r10,r10,(SID_SHIFT_1T - SID_SHIFT) /* get 1T ESID */
+ rldimi r10,r9,ESID_BITS_1T,0
ASM_VSID_SCRAMBLE(r10,r9,1T)
- rldimi r11,r10,SLB_VSID_SHIFT_1T,16 /* combine VSID and flags */
+ /*
+ * bits above VSID_BITS_1T need to be ignored from r10
+ * also combine VSID and flags
+ */
+ rldimi r11,r10,SLB_VSID_SHIFT_1T,(64 - (SLB_VSID_SHIFT_1T + VSID_BITS_1T))
li r10,MMU_SEGSIZE_1T
rldimi r11,r10,SLB_VSID_SSIZE_SHIFT,0 /* insert segment size */
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 73709f7ce92..b0c75cc15ef 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -34,6 +34,11 @@
#include <asm/mmu.h>
#include <asm/spu.h>
+/* some sanity checks */
+#if (PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE
+#error PGTABLE_RANGE exceeds slice_mask high_slices size
+#endif
+
static DEFINE_SPINLOCK(slice_convert_lock);
@@ -42,7 +47,7 @@ int _slice_debug = 1;
static void slice_print_mask(const char *label, struct slice_mask mask)
{
- char *p, buf[16 + 3 + 16 + 1];
+ char *p, buf[16 + 3 + 64 + 1];
int i;
if (!_slice_debug)
@@ -54,7 +59,7 @@ static void slice_print_mask(const char *label, struct slice_mask mask)
*(p++) = '-';
*(p++) = ' ';
for (i = 0; i < SLICE_NUM_HIGH; i++)
- *(p++) = (mask.high_slices & (1 << i)) ? '1' : '0';
+ *(p++) = (mask.high_slices & (1ul << i)) ? '1' : '0';
*(p++) = 0;
printk(KERN_DEBUG "%s:%s\n", label, buf);
@@ -84,8 +89,8 @@ static struct slice_mask slice_range_to_mask(unsigned long start,
}
if ((start + len) > SLICE_LOW_TOP)
- ret.high_slices = (1u << (GET_HIGH_SLICE_INDEX(end) + 1))
- - (1u << GET_HIGH_SLICE_INDEX(start));
+ ret.high_slices = (1ul << (GET_HIGH_SLICE_INDEX(end) + 1))
+ - (1ul << GET_HIGH_SLICE_INDEX(start));
return ret;
}
@@ -135,26 +140,31 @@ static struct slice_mask slice_mask_for_free(struct mm_struct *mm)
for (i = 0; i < SLICE_NUM_HIGH; i++)
if (!slice_high_has_vma(mm, i))
- ret.high_slices |= 1u << i;
+ ret.high_slices |= 1ul << i;
return ret;
}
static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize)
{
+ unsigned char *hpsizes;
+ int index, mask_index;
struct slice_mask ret = { 0, 0 };
unsigned long i;
- u64 psizes;
+ u64 lpsizes;
- psizes = mm->context.low_slices_psize;
+ lpsizes = mm->context.low_slices_psize;
for (i = 0; i < SLICE_NUM_LOW; i++)
- if (((psizes >> (i * 4)) & 0xf) == psize)
+ if (((lpsizes >> (i * 4)) & 0xf) == psize)
ret.low_slices |= 1u << i;
- psizes = mm->context.high_slices_psize;
- for (i = 0; i < SLICE_NUM_HIGH; i++)
- if (((psizes >> (i * 4)) & 0xf) == psize)
- ret.high_slices |= 1u << i;
+ hpsizes = mm->context.high_slices_psize;
+ for (i = 0; i < SLICE_NUM_HIGH; i++) {
+ mask_index = i & 0x1;
+ index = i >> 1;
+ if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == psize)
+ ret.high_slices |= 1ul << i;
+ }
return ret;
}
@@ -183,8 +193,10 @@ static void slice_flush_segments(void *parm)
static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psize)
{
+ int index, mask_index;
/* Write the new slice psize bits */
- u64 lpsizes, hpsizes;
+ unsigned char *hpsizes;
+ u64 lpsizes;
unsigned long i, flags;
slice_dbg("slice_convert(mm=%p, psize=%d)\n", mm, psize);
@@ -201,14 +213,18 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz
lpsizes = (lpsizes & ~(0xful << (i * 4))) |
(((unsigned long)psize) << (i * 4));
- hpsizes = mm->context.high_slices_psize;
- for (i = 0; i < SLICE_NUM_HIGH; i++)
- if (mask.high_slices & (1u << i))
- hpsizes = (hpsizes & ~(0xful << (i * 4))) |
- (((unsigned long)psize) << (i * 4));
-
+ /* Assign the value back */
mm->context.low_slices_psize = lpsizes;
- mm->context.high_slices_psize = hpsizes;
+
+ hpsizes = mm->context.high_slices_psize;
+ for (i = 0; i < SLICE_NUM_HIGH; i++) {
+ mask_index = i & 0x1;
+ index = i >> 1;
+ if (mask.high_slices & (1ul << i))
+ hpsizes[index] = (hpsizes[index] &
+ ~(0xf << (mask_index * 4))) |
+ (((unsigned long)psize) << (mask_index * 4));
+ }
slice_dbg(" lsps=%lx, hsps=%lx\n",
mm->context.low_slices_psize,
@@ -221,134 +237,112 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz
#endif
}
+/*
+ * Compute which slice addr is part of;
+ * set *boundary_addr to the start or end boundary of that slice
+ * (depending on 'end' parameter);
+ * return boolean indicating if the slice is marked as available in the
+ * 'available' slice_mark.
+ */
+static bool slice_scan_available(unsigned long addr,
+ struct slice_mask available,
+ int end,
+ unsigned long *boundary_addr)
+{
+ unsigned long slice;
+ if (addr < SLICE_LOW_TOP) {
+ slice = GET_LOW_SLICE_INDEX(addr);
+ *boundary_addr = (slice + end) << SLICE_LOW_SHIFT;
+ return !!(available.low_slices & (1u << slice));
+ } else {
+ slice = GET_HIGH_SLICE_INDEX(addr);
+ *boundary_addr = (slice + end) ?
+ ((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP;
+ return !!(available.high_slices & (1ul << slice));
+ }
+}
+
static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
unsigned long len,
struct slice_mask available,
- int psize, int use_cache)
+ int psize)
{
- struct vm_area_struct *vma;
- unsigned long start_addr, addr;
- struct slice_mask mask;
int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
-
- if (use_cache) {
- if (len <= mm->cached_hole_size) {
- start_addr = addr = TASK_UNMAPPED_BASE;
- mm->cached_hole_size = 0;
- } else
- start_addr = addr = mm->free_area_cache;
- } else
- start_addr = addr = TASK_UNMAPPED_BASE;
-
-full_search:
- for (;;) {
- addr = _ALIGN_UP(addr, 1ul << pshift);
- if ((TASK_SIZE - len) < addr)
- break;
- vma = find_vma(mm, addr);
- BUG_ON(vma && (addr >= vma->vm_end));
-
- mask = slice_range_to_mask(addr, len);
- if (!slice_check_fit(mask, available)) {
- if (addr < SLICE_LOW_TOP)
- addr = _ALIGN_UP(addr + 1, 1ul << SLICE_LOW_SHIFT);
- else
- addr = _ALIGN_UP(addr + 1, 1ul << SLICE_HIGH_SHIFT);
+ unsigned long addr, found, next_end;
+ struct vm_unmapped_area_info info;
+
+ info.flags = 0;
+ info.length = len;
+ info.align_mask = PAGE_MASK & ((1ul << pshift) - 1);
+ info.align_offset = 0;
+
+ addr = TASK_UNMAPPED_BASE;
+ while (addr < TASK_SIZE) {
+ info.low_limit = addr;
+ if (!slice_scan_available(addr, available, 1, &addr))
continue;
+
+ next_slice:
+ /*
+ * At this point [info.low_limit; addr) covers
+ * available slices only and ends at a slice boundary.
+ * Check if we need to reduce the range, or if we can
+ * extend it to cover the next available slice.
+ */
+ if (addr >= TASK_SIZE)
+ addr = TASK_SIZE;
+ else if (slice_scan_available(addr, available, 1, &next_end)) {
+ addr = next_end;
+ goto next_slice;
}
- if (!vma || addr + len <= vma->vm_start) {
- /*
- * Remember the place where we stopped the search:
- */
- if (use_cache)
- mm->free_area_cache = addr + len;
- return addr;
- }
- if (use_cache && (addr + mm->cached_hole_size) < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
- addr = vma->vm_end;
- }
+ info.high_limit = addr;
- /* Make sure we didn't miss any holes */
- if (use_cache && start_addr != TASK_UNMAPPED_BASE) {
- start_addr = addr = TASK_UNMAPPED_BASE;
- mm->cached_hole_size = 0;
- goto full_search;
+ found = vm_unmapped_area(&info);
+ if (!(found & ~PAGE_MASK))
+ return found;
}
+
return -ENOMEM;
}
static unsigned long slice_find_area_topdown(struct mm_struct *mm,
unsigned long len,
struct slice_mask available,
- int psize, int use_cache)
+ int psize)
{
- struct vm_area_struct *vma;
- unsigned long addr;
- struct slice_mask mask;
int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
+ unsigned long addr, found, prev;
+ struct vm_unmapped_area_info info;
- /* check if free_area_cache is useful for us */
- if (use_cache) {
- if (len <= mm->cached_hole_size) {
- mm->cached_hole_size = 0;
- mm->free_area_cache = mm->mmap_base;
- }
-
- /* either no address requested or can't fit in requested
- * address hole
- */
- addr = mm->free_area_cache;
-
- /* make sure it can fit in the remaining address space */
- if (addr > len) {
- addr = _ALIGN_DOWN(addr - len, 1ul << pshift);
- mask = slice_range_to_mask(addr, len);
- if (slice_check_fit(mask, available) &&
- slice_area_is_free(mm, addr, len))
- /* remember the address as a hint for
- * next time
- */
- return (mm->free_area_cache = addr);
- }
- }
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ info.length = len;
+ info.align_mask = PAGE_MASK & ((1ul << pshift) - 1);
+ info.align_offset = 0;
addr = mm->mmap_base;
- while (addr > len) {
- /* Go down by chunk size */
- addr = _ALIGN_DOWN(addr - len, 1ul << pshift);
-
- /* Check for hit with different page size */
- mask = slice_range_to_mask(addr, len);
- if (!slice_check_fit(mask, available)) {
- if (addr < SLICE_LOW_TOP)
- addr = _ALIGN_DOWN(addr, 1ul << SLICE_LOW_SHIFT);
- else if (addr < (1ul << SLICE_HIGH_SHIFT))
- addr = SLICE_LOW_TOP;
- else
- addr = _ALIGN_DOWN(addr, 1ul << SLICE_HIGH_SHIFT);
+ while (addr > PAGE_SIZE) {
+ info.high_limit = addr;
+ if (!slice_scan_available(addr - 1, available, 0, &addr))
continue;
- }
+ prev_slice:
/*
- * Lookup failure means no vma is above this address,
- * else if new region fits below vma->vm_start,
- * return with success:
+ * At this point [addr; info.high_limit) covers
+ * available slices only and starts at a slice boundary.
+ * Check if we need to reduce the range, or if we can
+ * extend it to cover the previous available slice.
*/
- vma = find_vma(mm, addr);
- if (!vma || (addr + len) <= vma->vm_start) {
- /* remember the address as a hint for next time */
- if (use_cache)
- mm->free_area_cache = addr;
- return addr;
+ if (addr < PAGE_SIZE)
+ addr = PAGE_SIZE;
+ else if (slice_scan_available(addr - 1, available, 0, &prev)) {
+ addr = prev;
+ goto prev_slice;
}
+ info.low_limit = addr;
- /* remember the largest hole we saw so far */
- if (use_cache && (addr + mm->cached_hole_size) < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
-
- /* try just below the current vma->vm_start */
- addr = vma->vm_start;
+ found = vm_unmapped_area(&info);
+ if (!(found & ~PAGE_MASK))
+ return found;
}
/*
@@ -357,28 +351,18 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm,
* can happen with large stack limits and large mmap()
* allocations.
*/
- addr = slice_find_area_bottomup(mm, len, available, psize, 0);
-
- /*
- * Restore the topdown base:
- */
- if (use_cache) {
- mm->free_area_cache = mm->mmap_base;
- mm->cached_hole_size = ~0UL;
- }
-
- return addr;
+ return slice_find_area_bottomup(mm, len, available, psize);
}
static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len,
struct slice_mask mask, int psize,
- int topdown, int use_cache)
+ int topdown)
{
if (topdown)
- return slice_find_area_topdown(mm, len, mask, psize, use_cache);
+ return slice_find_area_topdown(mm, len, mask, psize);
else
- return slice_find_area_bottomup(mm, len, mask, psize, use_cache);
+ return slice_find_area_bottomup(mm, len, mask, psize);
}
#define or_mask(dst, src) do { \
@@ -399,7 +383,7 @@ static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len,
unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
unsigned long flags, unsigned int psize,
- int topdown, int use_cache)
+ int topdown)
{
struct slice_mask mask = {0, 0};
struct slice_mask good_mask;
@@ -414,8 +398,8 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
BUG_ON(mm->task_size == 0);
slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize);
- slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d, use_cache=%d\n",
- addr, len, flags, topdown, use_cache);
+ slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d\n",
+ addr, len, flags, topdown);
if (len > mm->task_size)
return -ENOMEM;
@@ -424,7 +408,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
if (fixed && (addr & ((1ul << pshift) - 1)))
return -EINVAL;
if (fixed && addr > (mm->task_size - len))
- return -EINVAL;
+ return -ENOMEM;
/* If hint, make sure it matches our alignment restrictions */
if (!fixed && addr) {
@@ -487,8 +471,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
/* Now let's see if we can find something in the existing
* slices for that size
*/
- newaddr = slice_find_area(mm, len, good_mask, psize, topdown,
- use_cache);
+ newaddr = slice_find_area(mm, len, good_mask, psize, topdown);
if (newaddr != -ENOMEM) {
/* Found within the good mask, we don't have to setup,
* we thus return directly
@@ -520,8 +503,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
* anywhere in the good area.
*/
if (addr) {
- addr = slice_find_area(mm, len, good_mask, psize, topdown,
- use_cache);
+ addr = slice_find_area(mm, len, good_mask, psize, topdown);
if (addr != -ENOMEM) {
slice_dbg(" found area at 0x%lx\n", addr);
return addr;
@@ -531,15 +513,14 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
/* Now let's see if we can find something in the existing slices
* for that size plus free slices
*/
- addr = slice_find_area(mm, len, potential_mask, psize, topdown,
- use_cache);
+ addr = slice_find_area(mm, len, potential_mask, psize, topdown);
#ifdef CONFIG_PPC_64K_PAGES
if (addr == -ENOMEM && psize == MMU_PAGE_64K) {
/* retry the search with 4k-page slices included */
or_mask(potential_mask, compat_mask);
addr = slice_find_area(mm, len, potential_mask, psize,
- topdown, use_cache);
+ topdown);
}
#endif
@@ -570,8 +551,7 @@ unsigned long arch_get_unmapped_area(struct file *filp,
unsigned long flags)
{
return slice_get_unmapped_area(addr, len, flags,
- current->mm->context.user_psize,
- 0, 1);
+ current->mm->context.user_psize, 0);
}
unsigned long arch_get_unmapped_area_topdown(struct file *filp,
@@ -581,24 +561,24 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp,
const unsigned long flags)
{
return slice_get_unmapped_area(addr0, len, flags,
- current->mm->context.user_psize,
- 1, 1);
+ current->mm->context.user_psize, 1);
}
unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr)
{
- u64 psizes;
- int index;
+ unsigned char *hpsizes;
+ int index, mask_index;
if (addr < SLICE_LOW_TOP) {
- psizes = mm->context.low_slices_psize;
+ u64 lpsizes;
+ lpsizes = mm->context.low_slices_psize;
index = GET_LOW_SLICE_INDEX(addr);
- } else {
- psizes = mm->context.high_slices_psize;
- index = GET_HIGH_SLICE_INDEX(addr);
+ return (lpsizes >> (index * 4)) & 0xf;
}
-
- return (psizes >> (index * 4)) & 0xf;
+ hpsizes = mm->context.high_slices_psize;
+ index = GET_HIGH_SLICE_INDEX(addr);
+ mask_index = index & 0x1;
+ return (hpsizes[index >> 1] >> (mask_index * 4)) & 0xf;
}
EXPORT_SYMBOL_GPL(get_slice_psize);
@@ -618,7 +598,9 @@ EXPORT_SYMBOL_GPL(get_slice_psize);
*/
void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
{
- unsigned long flags, lpsizes, hpsizes;
+ int index, mask_index;
+ unsigned char *hpsizes;
+ unsigned long flags, lpsizes;
unsigned int old_psize;
int i;
@@ -639,15 +621,21 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
if (((lpsizes >> (i * 4)) & 0xf) == old_psize)
lpsizes = (lpsizes & ~(0xful << (i * 4))) |
(((unsigned long)psize) << (i * 4));
+ /* Assign the value back */
+ mm->context.low_slices_psize = lpsizes;
hpsizes = mm->context.high_slices_psize;
- for (i = 0; i < SLICE_NUM_HIGH; i++)
- if (((hpsizes >> (i * 4)) & 0xf) == old_psize)
- hpsizes = (hpsizes & ~(0xful << (i * 4))) |
- (((unsigned long)psize) << (i * 4));
+ for (i = 0; i < SLICE_NUM_HIGH; i++) {
+ mask_index = i & 0x1;
+ index = i >> 1;
+ if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == old_psize)
+ hpsizes[index] = (hpsizes[index] &
+ ~(0xf << (mask_index * 4))) |
+ (((unsigned long)psize) << (mask_index * 4));
+ }
+
+
- mm->context.low_slices_psize = lpsizes;
- mm->context.high_slices_psize = hpsizes;
slice_dbg(" lsps=%lx, hsps=%lx\n",
mm->context.low_slices_psize,
@@ -660,18 +648,27 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
void slice_set_psize(struct mm_struct *mm, unsigned long address,
unsigned int psize)
{
+ unsigned char *hpsizes;
unsigned long i, flags;
- u64 *p;
+ u64 *lpsizes;
spin_lock_irqsave(&slice_convert_lock, flags);
if (address < SLICE_LOW_TOP) {
i = GET_LOW_SLICE_INDEX(address);
- p = &mm->context.low_slices_psize;
+ lpsizes = &mm->context.low_slices_psize;
+ *lpsizes = (*lpsizes & ~(0xful << (i * 4))) |
+ ((unsigned long) psize << (i * 4));
} else {
+ int index, mask_index;
i = GET_HIGH_SLICE_INDEX(address);
- p = &mm->context.high_slices_psize;
+ hpsizes = mm->context.high_slices_psize;
+ mask_index = i & 0x1;
+ index = i >> 1;
+ hpsizes[index] = (hpsizes[index] &
+ ~(0xf << (mask_index * 4))) |
+ (((unsigned long)psize) << (mask_index * 4));
}
- *p = (*p & ~(0xful << (i * 4))) | ((unsigned long) psize << (i * 4));
+
spin_unlock_irqrestore(&slice_convert_lock, flags);
#ifdef CONFIG_SPU_BASE
@@ -688,7 +685,7 @@ void slice_set_range_psize(struct mm_struct *mm, unsigned long start,
}
/*
- * is_hugepage_only_range() is used by generic code to verify wether
+ * is_hugepage_only_range() is used by generic code to verify whether
* a normal mmap mapping (non hugetlbfs) is valid on a given area.
*
* until the generic code provides a more generic hook and/or starts
diff --git a/arch/powerpc/mm/stab.c b/arch/powerpc/mm/stab.c
index 41e31642a86..3f8efa6f299 100644
--- a/arch/powerpc/mm/stab.c
+++ b/arch/powerpc/mm/stab.c
@@ -20,9 +20,6 @@
#include <asm/paca.h>
#include <asm/cputable.h>
#include <asm/prom.h>
-#include <asm/abs_addr.h>
-#include <asm/firmware.h>
-#include <asm/iseries/hv_call.h>
struct stab_entry {
unsigned long esid_data;
@@ -259,7 +256,7 @@ void __init stabs_alloc(void)
memset((void *)newstab, 0, HW_PAGE_SIZE);
paca[cpu].stab_addr = newstab;
- paca[cpu].stab_real = virt_to_abs(newstab);
+ paca[cpu].stab_real = __pa(newstab);
printk(KERN_INFO "Segment table for CPU %d at 0x%llx "
"virtual, 0x%llx absolute\n",
cpu, paca[cpu].stab_addr, paca[cpu].stab_real);
@@ -285,12 +282,5 @@ void stab_initialize(unsigned long stab)
/* Set ASR */
stabreal = get_paca()->stab_real | 0x1ul;
-#ifdef CONFIG_PPC_ISERIES
- if (firmware_has_feature(FW_FEATURE_ISERIES)) {
- HvCall1(HvCallBaseSetASR, stabreal);
- return;
- }
-#endif /* CONFIG_PPC_ISERIES */
-
mtspr(SPRN_ASR, stabreal);
}
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index e4f8f1fc81a..6c0b1f5f8d2 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -78,7 +78,7 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
arch_enter_lazy_mmu_mode();
for (; npages > 0; --npages) {
- pte_update(mm, addr, pte, 0, 0);
+ pte_update(mm, addr, pte, 0, 0, 0);
addr += PAGE_SIZE;
++pte;
}
@@ -95,7 +95,8 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len)
struct mm_struct *mm = current->mm;
struct subpage_prot_table *spt = &mm->context.spt;
u32 **spm, *spp;
- int i, nw;
+ unsigned long i;
+ size_t nw;
unsigned long next, limit;
down_write(&mm->mmap_sem);
@@ -104,7 +105,7 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len)
limit = spt->maxaddr;
for (; addr < limit; addr = next) {
next = pmd_addr_end(addr, limit);
- if (addr < 0x100000000) {
+ if (addr < 0x100000000UL) {
spm = spt->low_prot;
} else {
spm = spt->protptrs[addr >> SBP_L3_SHIFT];
@@ -129,6 +130,53 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len)
up_write(&mm->mmap_sem);
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct vm_area_struct *vma = walk->private;
+ split_huge_page_pmd(vma, addr, pmd);
+ return 0;
+}
+
+static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
+ unsigned long len)
+{
+ struct vm_area_struct *vma;
+ struct mm_walk subpage_proto_walk = {
+ .mm = mm,
+ .pmd_entry = subpage_walk_pmd_entry,
+ };
+
+ /*
+ * We don't try too hard, we just mark all the vma in that range
+ * VM_NOHUGEPAGE and split them.
+ */
+ vma = find_vma(mm, addr);
+ /*
+ * If the range is in unmapped range, just return
+ */
+ if (vma && ((addr + len) <= vma->vm_start))
+ return;
+
+ while (vma) {
+ if (vma->vm_start >= (addr + len))
+ break;
+ vma->vm_flags |= VM_NOHUGEPAGE;
+ subpage_proto_walk.private = vma;
+ walk_page_range(vma->vm_start, vma->vm_end,
+ &subpage_proto_walk);
+ vma = vma->vm_next;
+ }
+}
+#else
+static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
+ unsigned long len)
+{
+ return;
+}
+#endif
+
/*
* Copy in a subpage protection map for an address range.
* The map has 2 bits per 4k subpage, so 32 bits per 64k page.
@@ -144,7 +192,8 @@ long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
struct mm_struct *mm = current->mm;
struct subpage_prot_table *spt = &mm->context.spt;
u32 **spm, *spp;
- int i, nw;
+ unsigned long i;
+ size_t nw;
unsigned long next, limit;
int err;
@@ -166,10 +215,11 @@ long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
return -EFAULT;
down_write(&mm->mmap_sem);
+ subpage_mark_vma_nohuge(mm, addr, len);
for (limit = addr + len; addr < limit; addr = next) {
next = pmd_addr_end(addr, limit);
err = -ENOMEM;
- if (addr < 0x100000000) {
+ if (addr < 0x100000000UL) {
spm = spt->low_prot;
} else {
spm = spt->protptrs[addr >> SBP_L3_SHIFT];
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
index 31f18207970..c99f6510a0b 100644
--- a/arch/powerpc/mm/tlb_hash64.c
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -23,7 +23,6 @@
#include <linux/kernel.h>
#include <linux/mm.h>
-#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <asm/pgalloc.h>
@@ -42,8 +41,9 @@ DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned long pte, int huge)
{
+ unsigned long vpn;
struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch);
- unsigned long vsid, vaddr;
+ unsigned long vsid;
unsigned int psize;
int ssize;
real_pte_t rpte;
@@ -81,12 +81,12 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
if (!is_kernel_addr(addr)) {
ssize = user_segment_size(addr);
vsid = get_vsid(mm->context.id, addr, ssize);
- WARN_ON(vsid == 0);
} else {
vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
ssize = mmu_kernel_ssize;
}
- vaddr = hpt_va(addr, vsid, ssize);
+ WARN_ON(vsid == 0);
+ vpn = hpt_vpn(addr, vsid, ssize);
rpte = __real_pte(__pte(pte), ptep);
/*
@@ -96,7 +96,7 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
* and decide to use local invalidates instead...
*/
if (!batch->active) {
- flush_hash_page(vaddr, rpte, psize, ssize, 0);
+ flush_hash_page(vpn, rpte, psize, ssize, 0);
put_cpu_var(ppc64_tlb_batch);
return;
}
@@ -122,7 +122,7 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
batch->ssize = ssize;
}
batch->pte[i] = rpte;
- batch->vaddr[i] = vaddr;
+ batch->vpn[i] = vpn;
batch->index = ++i;
if (i >= PPC64_TLB_BATCH_NR)
__flush_tlb_pending(batch);
@@ -146,7 +146,7 @@ void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
if (cpumask_equal(mm_cpumask(batch->mm), tmp))
local = 1;
if (i == 1)
- flush_hash_page(batch->vaddr[0], batch->pte[0],
+ flush_hash_page(batch->vpn[0], batch->pte[0],
batch->psize, batch->ssize, local);
else
flush_hash_range(i, local);
@@ -182,14 +182,13 @@ void tlb_flush(struct mmu_gather *tlb)
* since 64K pages may overlap with other bridges when using 64K pages
* with 4K HW pages on IO space.
*
- * Because of that usage pattern, it's only available with CONFIG_HOTPLUG
- * and is implemented for small size rather than speed.
+ * Because of that usage pattern, it is implemented for small size rather
+ * than speed.
*/
-#ifdef CONFIG_HOTPLUG
-
void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
unsigned long end)
{
+ int hugepage_shift;
unsigned long flags;
start = _ALIGN_DOWN(start, PAGE_SIZE);
@@ -207,7 +206,8 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
local_irq_save(flags);
arch_enter_lazy_mmu_mode();
for (; start < end; start += PAGE_SIZE) {
- pte_t *ptep = find_linux_pte(mm->pgd, start);
+ pte_t *ptep = find_linux_pte_or_hugepte(mm->pgd, start,
+ &hugepage_shift);
unsigned long pte;
if (ptep == NULL)
@@ -215,10 +215,38 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
pte = pte_val(*ptep);
if (!(pte & _PAGE_HASHPTE))
continue;
- hpte_need_flush(mm, start, ptep, pte, 0);
+ if (unlikely(hugepage_shift && pmd_trans_huge(*(pmd_t *)pte)))
+ hpte_do_hugepage_flush(mm, start, (pmd_t *)pte);
+ else
+ hpte_need_flush(mm, start, ptep, pte, 0);
}
arch_leave_lazy_mmu_mode();
local_irq_restore(flags);
}
-#endif /* CONFIG_HOTPLUG */
+void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
+{
+ pte_t *pte;
+ pte_t *start_pte;
+ unsigned long flags;
+
+ addr = _ALIGN_DOWN(addr, PMD_SIZE);
+ /* Note: Normally, we should only ever use a batch within a
+ * PTE locked section. This violates the rule, but will work
+ * since we don't actually modify the PTEs, we just flush the
+ * hash while leaving the PTEs intact (including their reference
+ * to being hashed). This is not the most performance oriented
+ * way to do things but is fine for our needs here.
+ */
+ local_irq_save(flags);
+ arch_enter_lazy_mmu_mode();
+ start_pte = pte_offset_map(pmd, addr);
+ for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) {
+ unsigned long pteval = pte_val(*pte);
+ if (pteval & _PAGE_HASHPTE)
+ hpte_need_flush(mm, addr, pte, pteval, 0);
+ addr += PAGE_SIZE;
+ }
+ arch_leave_lazy_mmu_mode();
+ local_irq_restore(flags);
+}
diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S
index ff672bd8fea..356e8b41fb0 100644
--- a/arch/powerpc/mm/tlb_low_64e.S
+++ b/arch/powerpc/mm/tlb_low_64e.S
@@ -20,6 +20,8 @@
#include <asm/pgtable.h>
#include <asm/exception-64e.h>
#include <asm/ppc-opcode.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_booke_hv_asm.h>
#ifdef CONFIG_PPC_64K_PAGES
#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE+1)
@@ -37,36 +39,54 @@
* *
**********************************************************************/
-.macro tlb_prolog_bolted addr
- mtspr SPRN_SPRG_TLB_SCRATCH,r13
+/*
+ * Note that, unlike non-bolted handlers, TLB_EXFRAME is not
+ * modified by the TLB miss handlers themselves, since the TLB miss
+ * handler code will not itself cause a recursive TLB miss.
+ *
+ * TLB_EXFRAME will be modified when crit/mc/debug exceptions are
+ * entered/exited.
+ */
+.macro tlb_prolog_bolted intnum addr
+ mtspr SPRN_SPRG_GEN_SCRATCH,r12
+ mfspr r12,SPRN_SPRG_TLB_EXFRAME
+ std r13,EX_TLB_R13(r12)
+ std r10,EX_TLB_R10(r12)
mfspr r13,SPRN_SPRG_PACA
- std r10,PACA_EXTLB+EX_TLB_R10(r13)
+
mfcr r10
- std r11,PACA_EXTLB+EX_TLB_R11(r13)
- std r16,PACA_EXTLB+EX_TLB_R16(r13)
+ std r11,EX_TLB_R11(r12)
+#ifdef CONFIG_KVM_BOOKE_HV
+BEGIN_FTR_SECTION
+ mfspr r11, SPRN_SRR1
+END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
+#endif
+ DO_KVM \intnum, SPRN_SRR1
+ std r16,EX_TLB_R16(r12)
mfspr r16,\addr /* get faulting address */
- std r14,PACA_EXTLB+EX_TLB_R14(r13)
+ std r14,EX_TLB_R14(r12)
ld r14,PACAPGD(r13)
- std r15,PACA_EXTLB+EX_TLB_R15(r13)
- std r10,PACA_EXTLB+EX_TLB_CR(r13)
- TLB_MISS_PROLOG_STATS_BOLTED
+ std r15,EX_TLB_R15(r12)
+ std r10,EX_TLB_CR(r12)
+ TLB_MISS_PROLOG_STATS
.endm
.macro tlb_epilog_bolted
- ld r14,PACA_EXTLB+EX_TLB_CR(r13)
- ld r10,PACA_EXTLB+EX_TLB_R10(r13)
- ld r11,PACA_EXTLB+EX_TLB_R11(r13)
+ ld r14,EX_TLB_CR(r12)
+ ld r10,EX_TLB_R10(r12)
+ ld r11,EX_TLB_R11(r12)
+ ld r13,EX_TLB_R13(r12)
mtcr r14
- ld r14,PACA_EXTLB+EX_TLB_R14(r13)
- ld r15,PACA_EXTLB+EX_TLB_R15(r13)
- TLB_MISS_RESTORE_STATS_BOLTED
- ld r16,PACA_EXTLB+EX_TLB_R16(r13)
- mfspr r13,SPRN_SPRG_TLB_SCRATCH
+ ld r14,EX_TLB_R14(r12)
+ ld r15,EX_TLB_R15(r12)
+ TLB_MISS_RESTORE_STATS
+ ld r16,EX_TLB_R16(r12)
+ mfspr r12,SPRN_SPRG_GEN_SCRATCH
.endm
/* Data TLB miss */
START_EXCEPTION(data_tlb_miss_bolted)
- tlb_prolog_bolted SPRN_DEAR
+ tlb_prolog_bolted BOOKE_INTERRUPT_DTLB_MISS SPRN_DEAR
/* We need _PAGE_PRESENT and _PAGE_ACCESSED set */
@@ -126,9 +146,9 @@ BEGIN_MMU_FTR_SECTION
/* Set the TLB reservation and search for existing entry. Then load
* the entry.
*/
- PPC_TLBSRX_DOT(0,r16)
+ PPC_TLBSRX_DOT(0,R16)
ldx r14,r14,r15 /* grab pgd entry */
- beq normal_tlb_miss_done /* tlb exists already, bail */
+ beq tlb_miss_done_bolted /* tlb exists already, bail */
MMU_FTR_SECTION_ELSE
ldx r14,r14,r15 /* grab pgd entry */
ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV)
@@ -184,6 +204,7 @@ ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV)
mtspr SPRN_MAS7_MAS3,r15
tlbwe
+tlb_miss_done_bolted:
TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK)
tlb_epilog_bolted
rfi
@@ -214,7 +235,7 @@ itlb_miss_fault_bolted:
/* Instruction TLB miss */
START_EXCEPTION(instruction_tlb_miss_bolted)
- tlb_prolog_bolted SPRN_SRR0
+ tlb_prolog_bolted BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR0
rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4
srdi r15,r16,60 /* get region */
@@ -231,6 +252,183 @@ itlb_miss_fault_bolted:
beq tlb_miss_common_bolted
b itlb_miss_kernel_bolted
+#ifdef CONFIG_PPC_FSL_BOOK3E
+/*
+ * TLB miss handling for e6500 and derivatives, using hardware tablewalk.
+ *
+ * Linear mapping is bolted: no virtual page table or nested TLB misses
+ * Indirect entries in TLB1, hardware loads resulting direct entries
+ * into TLB0
+ * No HES or NV hint on TLB1, so we need to do software round-robin
+ * No tlbsrx. so we need a spinlock, and we have to deal
+ * with MAS-damage caused by tlbsx
+ * 4K pages only
+ */
+
+ START_EXCEPTION(instruction_tlb_miss_e6500)
+ tlb_prolog_bolted BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR0
+
+ ld r11,PACA_TCD_PTR(r13)
+ srdi. r15,r16,60 /* get region */
+ ori r16,r16,1
+
+ TLB_MISS_STATS_SAVE_INFO_BOLTED
+ bne tlb_miss_kernel_e6500 /* user/kernel test */
+
+ b tlb_miss_common_e6500
+
+ START_EXCEPTION(data_tlb_miss_e6500)
+ tlb_prolog_bolted BOOKE_INTERRUPT_DTLB_MISS SPRN_DEAR
+
+ ld r11,PACA_TCD_PTR(r13)
+ srdi. r15,r16,60 /* get region */
+ rldicr r16,r16,0,62
+
+ TLB_MISS_STATS_SAVE_INFO_BOLTED
+ bne tlb_miss_kernel_e6500 /* user vs kernel check */
+
+/*
+ * This is the guts of the TLB miss handler for e6500 and derivatives.
+ * We are entered with:
+ *
+ * r16 = page of faulting address (low bit 0 if data, 1 if instruction)
+ * r15 = crap (free to use)
+ * r14 = page table base
+ * r13 = PACA
+ * r11 = tlb_per_core ptr
+ * r10 = cpu number
+ */
+tlb_miss_common_e6500:
+ /*
+ * Search if we already have an indirect entry for that virtual
+ * address, and if we do, bail out.
+ *
+ * MAS6:IND should be already set based on MAS4
+ */
+1: lbarx r15,0,r11
+ lhz r10,PACAPACAINDEX(r13)
+ cmpdi r15,0
+ cmpdi cr1,r15,1 /* set cr1.eq = 0 for non-recursive */
+ bne 2f
+ stbcx. r10,0,r11
+ bne 1b
+3:
+ .subsection 1
+2: cmpd cr1,r15,r10 /* recursive lock due to mcheck/crit/etc? */
+ beq cr1,3b /* unlock will happen if cr1.eq = 0 */
+ lbz r15,0(r11)
+ cmpdi r15,0
+ bne 2b
+ b 1b
+ .previous
+
+ mfspr r15,SPRN_MAS2
+
+ tlbsx 0,r16
+ mfspr r10,SPRN_MAS1
+ andis. r10,r10,MAS1_VALID@h
+ bne tlb_miss_done_e6500
+
+ /* Undo MAS-damage from the tlbsx */
+ mfspr r10,SPRN_MAS1
+ oris r10,r10,MAS1_VALID@h
+ mtspr SPRN_MAS1,r10
+ mtspr SPRN_MAS2,r15
+
+ /* Now, we need to walk the page tables. First check if we are in
+ * range.
+ */
+ rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4
+ bne- tlb_miss_fault_e6500
+
+ rldicl r15,r16,64-PGDIR_SHIFT+3,64-PGD_INDEX_SIZE-3
+ cmpldi cr0,r14,0
+ clrrdi r15,r15,3
+ beq- tlb_miss_fault_e6500 /* No PGDIR, bail */
+ ldx r14,r14,r15 /* grab pgd entry */
+
+ rldicl r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3
+ clrrdi r15,r15,3
+ cmpdi cr0,r14,0
+ bge tlb_miss_fault_e6500 /* Bad pgd entry or hugepage; bail */
+ ldx r14,r14,r15 /* grab pud entry */
+
+ rldicl r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3
+ clrrdi r15,r15,3
+ cmpdi cr0,r14,0
+ bge tlb_miss_fault_e6500
+ ldx r14,r14,r15 /* Grab pmd entry */
+
+ mfspr r10,SPRN_MAS0
+ cmpdi cr0,r14,0
+ bge tlb_miss_fault_e6500
+
+ /* Now we build the MAS for a 2M indirect page:
+ *
+ * MAS 0 : ESEL needs to be filled by software round-robin
+ * MAS 1 : Fully set up
+ * - PID already updated by caller if necessary
+ * - TSIZE for now is base ind page size always
+ * - TID already cleared if necessary
+ * MAS 2 : Default not 2M-aligned, need to be redone
+ * MAS 3+7 : Needs to be done
+ */
+
+ ori r14,r14,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT)
+ mtspr SPRN_MAS7_MAS3,r14
+
+ clrrdi r15,r16,21 /* make EA 2M-aligned */
+ mtspr SPRN_MAS2,r15
+
+ lbz r15,TCD_ESEL_NEXT(r11)
+ lbz r16,TCD_ESEL_MAX(r11)
+ lbz r14,TCD_ESEL_FIRST(r11)
+ rlwimi r10,r15,16,0x00ff0000 /* insert esel_next into MAS0 */
+ addi r15,r15,1 /* increment esel_next */
+ mtspr SPRN_MAS0,r10
+ cmpw r15,r16
+ iseleq r15,r14,r15 /* if next == last use first */
+ stb r15,TCD_ESEL_NEXT(r11)
+
+ tlbwe
+
+tlb_miss_done_e6500:
+ .macro tlb_unlock_e6500
+ beq cr1,1f /* no unlock if lock was recursively grabbed */
+ li r15,0
+ isync
+ stb r15,0(r11)
+1:
+ .endm
+
+ tlb_unlock_e6500
+ TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK)
+ tlb_epilog_bolted
+ rfi
+
+tlb_miss_kernel_e6500:
+ mfspr r10,SPRN_MAS1
+ ld r14,PACA_KERNELPGD(r13)
+ cmpldi cr0,r15,8 /* Check for vmalloc region */
+ rlwinm r10,r10,0,16,1 /* Clear TID */
+ mtspr SPRN_MAS1,r10
+ beq+ tlb_miss_common_e6500
+
+tlb_miss_fault_e6500:
+ tlb_unlock_e6500
+ /* We need to check if it was an instruction miss */
+ andi. r16,r16,1
+ bne itlb_miss_fault_e6500
+dtlb_miss_fault_e6500:
+ TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT)
+ tlb_epilog_bolted
+ b exc_data_storage_book3e
+itlb_miss_fault_e6500:
+ TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT)
+ tlb_epilog_bolted
+ b exc_instruction_storage_book3e
+#endif /* CONFIG_PPC_FSL_BOOK3E */
+
/**********************************************************************
* *
* TLB miss handling for Book3E with TLB reservation and HES support *
@@ -395,7 +593,7 @@ BEGIN_MMU_FTR_SECTION
/* Set the TLB reservation and search for existing entry. Then load
* the entry.
*/
- PPC_TLBSRX_DOT(0,r16)
+ PPC_TLBSRX_DOT(0,R16)
ld r14,0(r10)
beq normal_tlb_miss_done
MMU_FTR_SECTION_ELSE
@@ -528,7 +726,7 @@ BEGIN_MMU_FTR_SECTION
/* Search if we already have a TLB entry for that virtual address, and
* if we do, bail out.
*/
- PPC_TLBSRX_DOT(0,r16)
+ PPC_TLBSRX_DOT(0,R16)
beq virt_page_table_tlb_miss_done
END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV)
@@ -779,7 +977,7 @@ htw_tlb_miss:
*
* MAS1:IND should be already set based on MAS4
*/
- PPC_TLBSRX_DOT(0,r16)
+ PPC_TLBSRX_DOT(0,R16)
beq htw_tlb_miss_done
/* Now, we need to walk the page tables. First check if we are in
@@ -910,7 +1108,8 @@ tlb_load_linear:
ld r11,PACATOC(r13)
ld r11,linear_map_top@got(r11)
ld r10,0(r11)
- cmpld cr0,r10,r16
+ tovirt(10,10)
+ cmpld cr0,r16,r10
bge tlb_load_linear_fault
/* MAS1 need whole new setup. */
@@ -919,7 +1118,7 @@ tlb_load_linear:
mtspr SPRN_MAS1,r15
/* Already somebody there ? */
- PPC_TLBSRX_DOT(0,r16)
+ PPC_TLBSRX_DOT(0,R16)
beq tlb_load_linear_done
/* Now we build the remaining MAS. MAS0 and 2 should be fine
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index df32a838dcf..92cb18d52ea 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -43,6 +43,7 @@
#include <asm/tlb.h>
#include <asm/code-patching.h>
#include <asm/hugetlb.h>
+#include <asm/paca.h>
#include "mmu_decl.h"
@@ -58,6 +59,10 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
.shift = 12,
.enc = BOOK3E_PAGESZ_4K,
},
+ [MMU_PAGE_2M] = {
+ .shift = 21,
+ .enc = BOOK3E_PAGESZ_2M,
+ },
[MMU_PAGE_4M] = {
.shift = 22,
.enc = BOOK3E_PAGESZ_4M,
@@ -136,9 +141,18 @@ static inline int mmu_get_tsize(int psize)
int mmu_linear_psize; /* Page size used for the linear mapping */
int mmu_pte_psize; /* Page size used for PTE pages */
int mmu_vmemmap_psize; /* Page size used for the virtual mem map */
-int book3e_htw_enabled; /* Is HW tablewalk enabled ? */
+int book3e_htw_mode; /* HW tablewalk? Value is PPC_HTW_* */
unsigned long linear_map_top; /* Top of linear mapping */
+
+/*
+ * Number of bytes to add to SPRN_SPRG_TLB_EXFRAME on crit/mcheck/debug
+ * exceptions. This is used for bolted and e6500 TLB miss handlers which
+ * do not modify this SPRG in the TLB miss code; for other TLB miss handlers,
+ * this is set to zero.
+ */
+int extlb_level_exc;
+
#endif /* CONFIG_PPC64 */
#ifdef CONFIG_PPC_FSL_BOOK3E
@@ -305,7 +319,7 @@ void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
{
#ifdef CONFIG_HUGETLB_PAGE
- if (is_vm_hugetlb_page(vma))
+ if (vma && is_vm_hugetlb_page(vma))
flush_hugetlb_page(vma, vmaddr);
#endif
@@ -377,7 +391,7 @@ void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address)
{
int tsize = mmu_psize_defs[mmu_pte_psize].enc;
- if (book3e_htw_enabled) {
+ if (book3e_htw_mode != PPC_HTW_NONE) {
unsigned long start = address & PMD_MASK;
unsigned long end = address + PMD_SIZE;
unsigned long size = 1UL << mmu_psize_defs[mmu_pte_psize].shift;
@@ -414,9 +428,9 @@ static void setup_page_sizes(void)
#ifdef CONFIG_PPC_FSL_BOOK3E
unsigned int mmucfg = mfspr(SPRN_MMUCFG);
+ int fsl_mmu = mmu_has_feature(MMU_FTR_TYPE_FSL_E);
- if (((mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V1) &&
- (mmu_has_feature(MMU_FTR_TYPE_FSL_E))) {
+ if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V1) {
unsigned int tlb1cfg = mfspr(SPRN_TLB1CFG);
unsigned int min_pg, max_pg;
@@ -430,7 +444,7 @@ static void setup_page_sizes(void)
def = &mmu_psize_defs[psize];
shift = def->shift;
- if (shift == 0)
+ if (shift == 0 || shift & 1)
continue;
/* adjust to be in terms of 4^shift Kb */
@@ -440,7 +454,40 @@ static void setup_page_sizes(void)
def->flags |= MMU_PAGE_SIZE_DIRECT;
}
- goto no_indirect;
+ goto out;
+ }
+
+ if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2) {
+ u32 tlb1cfg, tlb1ps;
+
+ tlb0cfg = mfspr(SPRN_TLB0CFG);
+ tlb1cfg = mfspr(SPRN_TLB1CFG);
+ tlb1ps = mfspr(SPRN_TLB1PS);
+ eptcfg = mfspr(SPRN_EPTCFG);
+
+ if ((tlb1cfg & TLBnCFG_IND) && (tlb0cfg & TLBnCFG_PT))
+ book3e_htw_mode = PPC_HTW_E6500;
+
+ /*
+ * We expect 4K subpage size and unrestricted indirect size.
+ * The lack of a restriction on indirect size is a Freescale
+ * extension, indicated by PSn = 0 but SPSn != 0.
+ */
+ if (eptcfg != 2)
+ book3e_htw_mode = PPC_HTW_NONE;
+
+ for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+ struct mmu_psize_def *def = &mmu_psize_defs[psize];
+
+ if (tlb1ps & (1U << (def->shift - 10))) {
+ def->flags |= MMU_PAGE_SIZE_DIRECT;
+
+ if (book3e_htw_mode && psize == MMU_PAGE_2M)
+ def->flags |= MMU_PAGE_SIZE_INDIRECT;
+ }
+ }
+
+ goto out;
}
#endif
@@ -457,8 +504,11 @@ static void setup_page_sizes(void)
}
/* Indirect page sizes supported ? */
- if ((tlb0cfg & TLBnCFG_IND) == 0)
- goto no_indirect;
+ if ((tlb0cfg & TLBnCFG_IND) == 0 ||
+ (tlb0cfg & TLBnCFG_PT) == 0)
+ goto out;
+
+ book3e_htw_mode = PPC_HTW_IBM;
/* Now, we only deal with one IND page size for each
* direct size. Hopefully all implementations today are
@@ -483,8 +533,8 @@ static void setup_page_sizes(void)
def->ind = ps + 10;
}
}
- no_indirect:
+out:
/* Cleanup array and print summary */
pr_info("MMU: Supported page sizes\n");
for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
@@ -504,44 +554,28 @@ static void setup_page_sizes(void)
}
}
-static void __patch_exception(int exc, unsigned long addr)
-{
- extern unsigned int interrupt_base_book3e;
- unsigned int *ibase = &interrupt_base_book3e;
-
- /* Our exceptions vectors start with a NOP and -then- a branch
- * to deal with single stepping from userspace which stops on
- * the second instruction. Thus we need to patch the second
- * instruction of the exception, not the first one
- */
-
- patch_branch(ibase + (exc / 4) + 1, addr, 0);
-}
-
-#define patch_exception(exc, name) do { \
- extern unsigned int name; \
- __patch_exception((exc), (unsigned long)&name); \
-} while (0)
-
static void setup_mmu_htw(void)
{
- /* Check if HW tablewalk is present, and if yes, enable it by:
- *
- * - patching the TLB miss handlers to branch to the
- * one dedicates to it
- *
- * - setting the global book3e_htw_enabled
- */
- unsigned int tlb0cfg = mfspr(SPRN_TLB0CFG);
+ /*
+ * If we want to use HW tablewalk, enable it by patching the TLB miss
+ * handlers to branch to the one dedicated to it.
+ */
- if ((tlb0cfg & TLBnCFG_IND) &&
- (tlb0cfg & TLBnCFG_PT)) {
+ switch (book3e_htw_mode) {
+ case PPC_HTW_IBM:
patch_exception(0x1c0, exc_data_tlb_miss_htw_book3e);
patch_exception(0x1e0, exc_instruction_tlb_miss_htw_book3e);
- book3e_htw_enabled = 1;
+ break;
+#ifdef CONFIG_PPC_FSL_BOOK3E
+ case PPC_HTW_E6500:
+ extlb_level_exc = EX_TLB_SIZE;
+ patch_exception(0x1c0, exc_data_tlb_miss_e6500_book3e);
+ patch_exception(0x1e0, exc_instruction_tlb_miss_e6500_book3e);
+ break;
+#endif
}
pr_info("MMU: Book3E HW tablewalk %s\n",
- book3e_htw_enabled ? "enabled" : "not supported");
+ book3e_htw_mode != PPC_HTW_NONE ? "enabled" : "not supported");
}
/*
@@ -562,8 +596,13 @@ static void __early_init_mmu(int boot_cpu)
/* XXX This should be decided at runtime based on supported
* page sizes in the TLB, but for now let's assume 16M is
* always there and a good fit (which it probably is)
+ *
+ * Freescale booke only supports 4K pages in TLB0, so use that.
*/
- mmu_vmemmap_psize = MMU_PAGE_16M;
+ if (mmu_has_feature(MMU_FTR_TYPE_FSL_E))
+ mmu_vmemmap_psize = MMU_PAGE_4K;
+ else
+ mmu_vmemmap_psize = MMU_PAGE_16M;
/* XXX This code only checks for TLB 0 capabilities and doesn't
* check what page size combos are supported by the HW. It
@@ -581,8 +620,16 @@ static void __early_init_mmu(int boot_cpu)
/* Set MAS4 based on page table setting */
mas4 = 0x4 << MAS4_WIMGED_SHIFT;
- if (book3e_htw_enabled) {
- mas4 |= mas4 | MAS4_INDD;
+ switch (book3e_htw_mode) {
+ case PPC_HTW_E6500:
+ mas4 |= MAS4_INDD;
+ mas4 |= BOOK3E_PAGESZ_2M << MAS4_TSIZED_SHIFT;
+ mas4 |= MAS4_TLBSELD(1);
+ mmu_pte_psize = MMU_PAGE_2M;
+ break;
+
+ case PPC_HTW_IBM:
+ mas4 |= MAS4_INDD;
#ifdef CONFIG_PPC_64K_PAGES
mas4 |= BOOK3E_PAGESZ_256M << MAS4_TSIZED_SHIFT;
mmu_pte_psize = MMU_PAGE_256M;
@@ -590,13 +637,16 @@ static void __early_init_mmu(int boot_cpu)
mas4 |= BOOK3E_PAGESZ_1M << MAS4_TSIZED_SHIFT;
mmu_pte_psize = MMU_PAGE_1M;
#endif
- } else {
+ break;
+
+ case PPC_HTW_NONE:
#ifdef CONFIG_PPC_64K_PAGES
mas4 |= BOOK3E_PAGESZ_64K << MAS4_TSIZED_SHIFT;
#else
mas4 |= BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT;
#endif
mmu_pte_psize = mmu_virtual_psize;
+ break;
}
mtspr(SPRN_MAS4, mas4);
@@ -616,8 +666,12 @@ static void __early_init_mmu(int boot_cpu)
/* limit memory so we dont have linear faults */
memblock_enforce_memory_limit(linear_map_top);
- patch_exception(0x1c0, exc_data_tlb_miss_bolted_book3e);
- patch_exception(0x1e0, exc_instruction_tlb_miss_bolted_book3e);
+ if (book3e_htw_mode == PPC_HTW_NONE) {
+ extlb_level_exc = EX_TLB_SIZE;
+ patch_exception(0x1c0, exc_data_tlb_miss_bolted_book3e);
+ patch_exception(0x1e0,
+ exc_instruction_tlb_miss_bolted_book3e);
+ }
}
#endif
@@ -634,7 +688,7 @@ void __init early_init_mmu(void)
__early_init_mmu(1);
}
-void __cpuinit early_init_mmu_secondary(void)
+void early_init_mmu_secondary(void)
{
__early_init_mmu(0);
}
diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S
index 7c63c0ed4f1..43ff3c797fb 100644
--- a/arch/powerpc/mm/tlb_nohash_low.S
+++ b/arch/powerpc/mm/tlb_nohash_low.S
@@ -191,12 +191,6 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x)
#ifdef CONFIG_PPC_47x
/*
- * 47x variant of icbt
- */
-# define ICBT(CT,RA,RB) \
- .long 0x7c00002c | ((CT) << 21) | ((RA) << 16) | ((RB) << 11)
-
-/*
* _tlbivax_bcast is only on 47x. We don't bother doing a runtime
* check though, it will blow up soon enough if we mistakenly try
* to use it on a 440.
@@ -208,8 +202,7 @@ _GLOBAL(_tlbivax_bcast)
wrteei 0
mtspr SPRN_MMUCR,r5
isync
-/* tlbivax 0,r3 - use .long to avoid binutils deps */
- .long 0x7c000624 | (r3 << 11)
+ PPC_TLBIVAX(0, R3)
isync
eieio
tlbsync
@@ -227,11 +220,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_476_DD2)
bl 2f
2: mflr r6
li r7,32
- ICBT(0,r6,r7) /* touch next cache line */
+ PPC_ICBT(0,R6,R7) /* touch next cache line */
add r6,r6,r7
- ICBT(0,r6,r7) /* touch next cache line */
+ PPC_ICBT(0,R6,R7) /* touch next cache line */
add r6,r6,r7
- ICBT(0,r6,r7) /* touch next cache line */
+ PPC_ICBT(0,R6,R7) /* touch next cache line */
sync
nop
nop
@@ -266,7 +259,7 @@ BEGIN_MMU_FTR_SECTION
andi. r3,r3,MMUCSR0_TLBFI@l
bne 1b
MMU_FTR_SECTION_ELSE
- PPC_TLBILX_ALL(0,0)
+ PPC_TLBILX_ALL(0,R0)
ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX)
msync
isync
@@ -279,7 +272,7 @@ BEGIN_MMU_FTR_SECTION
wrteei 0
mfspr r4,SPRN_MAS6 /* save MAS6 */
mtspr SPRN_MAS6,r3
- PPC_TLBILX_PID(0,0)
+ PPC_TLBILX_PID(0,R0)
mtspr SPRN_MAS6,r4 /* restore MAS6 */
wrtee r10
MMU_FTR_SECTION_ELSE
@@ -313,7 +306,7 @@ BEGIN_MMU_FTR_SECTION
mtspr SPRN_MAS1,r4
tlbwe
MMU_FTR_SECTION_ELSE
- PPC_TLBILX_VA(0,r3)
+ PPC_TLBILX_VA(0,R3)
ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX)
msync
isync
@@ -331,7 +324,7 @@ _GLOBAL(_tlbil_pid)
mfmsr r10
wrteei 0
mtspr SPRN_MAS6,r4
- PPC_TLBILX_PID(0,0)
+ PPC_TLBILX_PID(0,R0)
wrtee r10
msync
isync
@@ -343,14 +336,14 @@ _GLOBAL(_tlbil_pid_noind)
ori r4,r4,MAS6_SIND
wrteei 0
mtspr SPRN_MAS6,r4
- PPC_TLBILX_PID(0,0)
+ PPC_TLBILX_PID(0,R0)
wrtee r10
msync
isync
blr
_GLOBAL(_tlbil_all)
- PPC_TLBILX_ALL(0,0)
+ PPC_TLBILX_ALL(0,R0)
msync
isync
blr
@@ -364,7 +357,7 @@ _GLOBAL(_tlbil_va)
beq 1f
rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND
1: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */
- PPC_TLBILX_VA(0,r3)
+ PPC_TLBILX_VA(0,R3)
msync
isync
wrtee r10
@@ -379,7 +372,7 @@ _GLOBAL(_tlbivax_bcast)
beq 1f
rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND
1: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */
- PPC_TLBIVAX(0,r3)
+ PPC_TLBIVAX(0,R3)
eieio
tlbsync
sync
@@ -409,7 +402,9 @@ _GLOBAL(set_context)
* Load TLBCAM[index] entry in to the L2 CAM MMU
*/
_GLOBAL(loadcam_entry)
- LOAD_REG_ADDR(r4, TLBCAM)
+ mflr r5
+ LOAD_REG_ADDR_PIC(r4, TLBCAM)
+ mtlr r5
mulli r5,r3,TLBCAM_SIZE
add r3,r5,r4
lwz r4,TLBCAM_MAS0(r3)