aboutsummaryrefslogtreecommitdiff
path: root/arch/powerpc/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r--arch/powerpc/mm/44x_mmu.c6
-rw-r--r--arch/powerpc/mm/Makefile10
-rw-r--r--arch/powerpc/mm/dma-noncoherent.c4
-rw-r--r--arch/powerpc/mm/fault.c69
-rw-r--r--arch/powerpc/mm/fsl_booke_mmu.c80
-rw-r--r--arch/powerpc/mm/gup.c87
-rw-r--r--arch/powerpc/mm/hash_low_64.S164
-rw-r--r--arch/powerpc/mm/hash_native_64.c285
-rw-r--r--arch/powerpc/mm/hash_utils_64.c478
-rw-r--r--arch/powerpc/mm/hugepage-hash64.c179
-rw-r--r--arch/powerpc/mm/hugetlbpage-book3e.c57
-rw-r--r--arch/powerpc/mm/hugetlbpage-hash64.c39
-rw-r--r--arch/powerpc/mm/hugetlbpage.c435
-rw-r--r--arch/powerpc/mm/icswx.c2
-rw-r--r--arch/powerpc/mm/init_32.c7
-rw-r--r--arch/powerpc/mm/init_64.c84
-rw-r--r--arch/powerpc/mm/mem.c142
-rw-r--r--arch/powerpc/mm/mmap.c (renamed from arch/powerpc/mm/mmap_64.c)2
-rw-r--r--arch/powerpc/mm/mmu_context_hash64.c48
-rw-r--r--arch/powerpc/mm/mmu_context_nohash.c27
-rw-r--r--arch/powerpc/mm/mmu_decl.h2
-rw-r--r--arch/powerpc/mm/numa.c556
-rw-r--r--arch/powerpc/mm/pgtable.c30
-rw-r--r--arch/powerpc/mm/pgtable_32.c6
-rw-r--r--arch/powerpc/mm/pgtable_64.c555
-rw-r--r--arch/powerpc/mm/slb.c23
-rw-r--r--arch/powerpc/mm/slb_low.S62
-rw-r--r--arch/powerpc/mm/slice.c225
-rw-r--r--arch/powerpc/mm/subpage-prot.c54
-rw-r--r--arch/powerpc/mm/tlb_hash64.c43
-rw-r--r--arch/powerpc/mm/tlb_low_64e.S227
-rw-r--r--arch/powerpc/mm/tlb_nohash.c150
-rw-r--r--arch/powerpc/mm/tlb_nohash_low.S4
33 files changed, 3090 insertions, 1052 deletions
diff --git a/arch/powerpc/mm/44x_mmu.c b/arch/powerpc/mm/44x_mmu.c
index 2c9441ee6bb..82b1ff759e2 100644
--- a/arch/powerpc/mm/44x_mmu.c
+++ b/arch/powerpc/mm/44x_mmu.c
@@ -41,7 +41,7 @@ int icache_44x_need_flush;
unsigned long tlb_47x_boltmap[1024/8];
-static void __cpuinit ppc44x_update_tlb_hwater(void)
+static void ppc44x_update_tlb_hwater(void)
{
extern unsigned int tlb_44x_patch_hwater_D[];
extern unsigned int tlb_44x_patch_hwater_I[];
@@ -134,7 +134,7 @@ static void __init ppc47x_update_boltmap(void)
/*
* "Pins" a 256MB TLB entry in AS0 for kernel lowmem for 47x type MMU
*/
-static void __cpuinit ppc47x_pin_tlb(unsigned int virt, unsigned int phys)
+static void ppc47x_pin_tlb(unsigned int virt, unsigned int phys)
{
unsigned int rA;
int bolted;
@@ -229,7 +229,7 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base,
}
#ifdef CONFIG_SMP
-void __cpuinit mmu_init_secondary(int cpu)
+void mmu_init_secondary(int cpu)
{
unsigned long addr;
unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1);
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 3787b61f7d2..51230ee6a40 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -4,19 +4,18 @@
subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
-ccflags-$(CONFIG_PPC64) := -mno-minimal-toc
+ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC)
-obj-y := fault.o mem.o pgtable.o gup.o \
+obj-y := fault.o mem.o pgtable.o gup.o mmap.o \
init_$(CONFIG_WORD_SIZE).o \
pgtable_$(CONFIG_WORD_SIZE).o
obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \
tlb_nohash_low.o
obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(CONFIG_WORD_SIZE)e.o
-obj-$(CONFIG_PPC64) += mmap_64.o
hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o
obj-$(CONFIG_PPC_STD_MMU_64) += hash_utils_64.o \
slb_low.o slb.o stab.o \
- mmap_64.o $(hash64-y)
+ $(hash64-y)
obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o
obj-$(CONFIG_PPC_STD_MMU) += hash_low_$(CONFIG_WORD_SIZE).o \
tlb_hash$(CONFIG_WORD_SIZE).o \
@@ -28,11 +27,12 @@ obj-$(CONFIG_44x) += 44x_mmu.o
obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_booke_mmu.o
obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
obj-$(CONFIG_PPC_MM_SLICES) += slice.o
-ifeq ($(CONFIG_HUGETLB_PAGE),y)
obj-y += hugetlbpage.o
+ifeq ($(CONFIG_HUGETLB_PAGE),y)
obj-$(CONFIG_PPC_STD_MMU_64) += hugetlbpage-hash64.o
obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o
endif
+obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o
obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o
obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
obj-$(CONFIG_HIGHMEM) += highmem.o
diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c
index 6747eece84a..7b6c1075017 100644
--- a/arch/powerpc/mm/dma-noncoherent.c
+++ b/arch/powerpc/mm/dma-noncoherent.c
@@ -287,9 +287,7 @@ void __dma_free_coherent(size_t size, void *vaddr)
pte_clear(&init_mm, addr, ptep);
if (pfn_valid(pfn)) {
struct page *page = pfn_to_page(pfn);
-
- ClearPageReserved(page);
- __free_page(page);
+ __free_reserved_page(page);
}
}
addr += PAGE_SIZE;
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 3a8489a354e..51ab9e7e6c3 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -32,6 +32,7 @@
#include <linux/perf_event.h>
#include <linux/magic.h>
#include <linux/ratelimit.h>
+#include <linux/context_tracking.h>
#include <asm/firmware.h>
#include <asm/page.h>
@@ -196,6 +197,7 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
unsigned long error_code)
{
+ enum ctx_state prev_state = exception_enter();
struct vm_area_struct * vma;
struct mm_struct *mm = current->mm;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
@@ -204,6 +206,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
int trap = TRAP(regs);
int is_exec = trap == 0x400;
int fault;
+ int rc = 0, store_update_sp = 0;
#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
/*
@@ -220,9 +223,6 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
is_write = error_code & ESR_DST;
#endif /* CONFIG_4xx || CONFIG_BOOKE */
- if (is_write)
- flags |= FAULT_FLAG_WRITE;
-
#ifdef CONFIG_PPC_ICSWX
/*
* we need to do this early because this "data storage
@@ -230,28 +230,30 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
* look at it
*/
if (error_code & ICSWX_DSI_UCT) {
- int rc = acop_handle_fault(regs, address, error_code);
+ rc = acop_handle_fault(regs, address, error_code);
if (rc)
- return rc;
+ goto bail;
}
#endif /* CONFIG_PPC_ICSWX */
if (notify_page_fault(regs))
- return 0;
+ goto bail;
if (unlikely(debugger_fault_handler(regs)))
- return 0;
+ goto bail;
/* On a kernel SLB miss we can only check for a valid exception entry */
- if (!user_mode(regs) && (address >= TASK_SIZE))
- return SIGSEGV;
+ if (!user_mode(regs) && (address >= TASK_SIZE)) {
+ rc = SIGSEGV;
+ goto bail;
+ }
#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE) || \
defined(CONFIG_PPC_BOOK3S_64))
if (error_code & DSISR_DABRMATCH) {
- /* DABR match */
- do_dabr(regs, address, error_code);
- return 0;
+ /* breakpoint match */
+ do_break(regs, address, error_code);
+ goto bail;
}
#endif
@@ -260,8 +262,10 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
local_irq_enable();
if (in_atomic() || mm == NULL) {
- if (!user_mode(regs))
- return SIGSEGV;
+ if (!user_mode(regs)) {
+ rc = SIGSEGV;
+ goto bail;
+ }
/* in_atomic() in user mode is really bad,
as is current->mm == NULL. */
printk(KERN_EMERG "Page fault in user mode with "
@@ -273,6 +277,17 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ /*
+ * We want to do this outside mmap_sem, because reading code around nip
+ * can result in fault, which will cause a deadlock when called with
+ * mmap_sem held
+ */
+ if (user_mode(regs))
+ store_update_sp = store_updates_sp(regs);
+
+ if (user_mode(regs))
+ flags |= FAULT_FLAG_USER;
+
/* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in the
* kernel and should generate an OOPS. Unfortunately, in the case of an
@@ -338,8 +353,7 @@ retry:
* between the last mapped region and the stack will
* expand the stack rather than segfaulting.
*/
- if (address + 2048 < uregs->gpr[1]
- && (!user_mode(regs) || !store_updates_sp(regs)))
+ if (address + 2048 < uregs->gpr[1] && !store_update_sp)
goto bad_area;
}
if (expand_stack(vma, address))
@@ -401,6 +415,7 @@ good_area:
} else if (is_write) {
if (!(vma->vm_flags & VM_WRITE))
goto bad_area;
+ flags |= FAULT_FLAG_WRITE;
/* a read */
} else {
/* protection fault */
@@ -417,9 +432,11 @@ good_area:
*/
fault = handle_mm_fault(mm, vma, address, flags);
if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
- int rc = mm_fault_error(regs, address, fault);
+ rc = mm_fault_error(regs, address, fault);
if (rc >= MM_FAULT_RETURN)
- return rc;
+ goto bail;
+ else
+ rc = 0;
}
/*
@@ -434,8 +451,12 @@ good_area:
regs, address);
#ifdef CONFIG_PPC_SMLPAR
if (firmware_has_feature(FW_FEATURE_CMO)) {
+ u32 page_ins;
+
preempt_disable();
- get_lppaca()->page_ins += (1 << PAGE_FACTOR);
+ page_ins = be32_to_cpu(get_lppaca()->page_ins);
+ page_ins += 1 << PAGE_FACTOR;
+ get_lppaca()->page_ins = cpu_to_be32(page_ins);
preempt_enable();
}
#endif /* CONFIG_PPC_SMLPAR */
@@ -454,7 +475,7 @@ good_area:
}
up_read(&mm->mmap_sem);
- return 0;
+ goto bail;
bad_area:
up_read(&mm->mmap_sem);
@@ -463,7 +484,7 @@ bad_area_nosemaphore:
/* User mode accesses cause a SIGSEGV */
if (user_mode(regs)) {
_exception(SIGSEGV, regs, code, address);
- return 0;
+ goto bail;
}
if (is_exec && (error_code & DSISR_PROTFAULT))
@@ -471,7 +492,11 @@ bad_area_nosemaphore:
" page (%lx) - exploit attempt? (uid: %d)\n",
address, from_kuid(&init_user_ns, current_uid()));
- return SIGSEGV;
+ rc = SIGSEGV;
+
+bail:
+ exception_exit(prev_state);
+ return rc;
}
diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c
index 07ba45b0f07..94cd728166d 100644
--- a/arch/powerpc/mm/fsl_booke_mmu.c
+++ b/arch/powerpc/mm/fsl_booke_mmu.c
@@ -52,6 +52,7 @@
#include <asm/smp.h>
#include <asm/machdep.h>
#include <asm/setup.h>
+#include <asm/paca.h>
#include "mmu_decl.h"
@@ -171,11 +172,10 @@ unsigned long calc_cam_sz(unsigned long ram, unsigned long virt,
return 1UL << camsize;
}
-unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx)
+static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt,
+ unsigned long ram, int max_cam_idx)
{
int i;
- unsigned long virt = PAGE_OFFSET;
- phys_addr_t phys = memstart_addr;
unsigned long amount_mapped = 0;
/* Calculate CAM values */
@@ -192,9 +192,23 @@ unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx)
}
tlbcam_index = i;
+#ifdef CONFIG_PPC64
+ get_paca()->tcd.esel_next = i;
+ get_paca()->tcd.esel_max = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY;
+ get_paca()->tcd.esel_first = i;
+#endif
+
return amount_mapped;
}
+unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx)
+{
+ unsigned long virt = PAGE_OFFSET;
+ phys_addr_t phys = memstart_addr;
+
+ return map_mem_in_cams_addr(phys, virt, ram, max_cam_idx);
+}
+
#ifdef CONFIG_PPC32
#if defined(CONFIG_LOWMEM_CAM_NUM_BOOL) && (CONFIG_LOWMEM_CAM_NUM >= NUM_TLBCAMS)
@@ -222,7 +236,9 @@ void __init adjust_total_lowmem(void)
/* adjust lowmem size to __max_low_memory */
ram = min((phys_addr_t)__max_low_memory, (phys_addr_t)total_lowmem);
+ i = switch_to_as1();
__max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM);
+ restore_to_as0(i, 0, 0, 1);
pr_info("Memory CAM mapping: ");
for (i = 0; i < tlbcam_index - 1; i++)
@@ -241,4 +257,62 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base,
/* 64M mapped initially according to head_fsl_booke.S */
memblock_set_current_limit(min_t(u64, limit, 0x04000000));
}
+
+#ifdef CONFIG_RELOCATABLE
+int __initdata is_second_reloc;
+notrace void __init relocate_init(u64 dt_ptr, phys_addr_t start)
+{
+ unsigned long base = KERNELBASE;
+
+ kernstart_addr = start;
+ if (is_second_reloc) {
+ virt_phys_offset = PAGE_OFFSET - memstart_addr;
+ return;
+ }
+
+ /*
+ * Relocatable kernel support based on processing of dynamic
+ * relocation entries. Before we get the real memstart_addr,
+ * We will compute the virt_phys_offset like this:
+ * virt_phys_offset = stext.run - kernstart_addr
+ *
+ * stext.run = (KERNELBASE & ~0x3ffffff) +
+ * (kernstart_addr & 0x3ffffff)
+ * When we relocate, we have :
+ *
+ * (kernstart_addr & 0x3ffffff) = (stext.run & 0x3ffffff)
+ *
+ * hence:
+ * virt_phys_offset = (KERNELBASE & ~0x3ffffff) -
+ * (kernstart_addr & ~0x3ffffff)
+ *
+ */
+ start &= ~0x3ffffff;
+ base &= ~0x3ffffff;
+ virt_phys_offset = base - start;
+ early_get_first_memblock_info(__va(dt_ptr), NULL);
+ /*
+ * We now get the memstart_addr, then we should check if this
+ * address is the same as what the PAGE_OFFSET map to now. If
+ * not we have to change the map of PAGE_OFFSET to memstart_addr
+ * and do a second relocation.
+ */
+ if (start != memstart_addr) {
+ int n;
+ long offset = start - memstart_addr;
+
+ is_second_reloc = 1;
+ n = switch_to_as1();
+ /* map a 64M area for the second relocation */
+ if (memstart_addr > start)
+ map_mem_in_cams(0x4000000, CONFIG_LOWMEM_CAM_NUM);
+ else
+ map_mem_in_cams_addr(start, PAGE_OFFSET + offset,
+ 0x4000000, CONFIG_LOWMEM_CAM_NUM);
+ restore_to_as0(n, offset, __va(dt_ptr), 1);
+ /* We should never reach here */
+ panic("Relocation error");
+ }
+}
+#endif
#endif
diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
index d7efdbf640c..d8746684f60 100644
--- a/arch/powerpc/mm/gup.c
+++ b/arch/powerpc/mm/gup.c
@@ -34,8 +34,13 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
ptep = pte_offset_kernel(&pmd, addr);
do {
- pte_t pte = *ptep;
+ pte_t pte = ACCESS_ONCE(*ptep);
struct page *page;
+ /*
+ * Similar to the PMD case, NUMA hinting must take slow path
+ */
+ if (pte_numa(pte))
+ return 0;
if ((pte_val(pte) & mask) != result)
return 0;
@@ -63,12 +68,30 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
pmdp = pmd_offset(&pud, addr);
do {
- pmd_t pmd = *pmdp;
+ pmd_t pmd = ACCESS_ONCE(*pmdp);
next = pmd_addr_end(addr, end);
- if (pmd_none(pmd))
+ /*
+ * If we find a splitting transparent hugepage we
+ * return zero. That will result in taking the slow
+ * path which will call wait_split_huge_page()
+ * if the pmd is still in splitting state
+ */
+ if (pmd_none(pmd) || pmd_trans_splitting(pmd))
return 0;
- if (is_hugepd(pmdp)) {
+ if (pmd_huge(pmd) || pmd_large(pmd)) {
+ /*
+ * NUMA hinting faults need to be handled in the GUP
+ * slowpath for accounting purposes and so that they
+ * can be serialised against THP migration.
+ */
+ if (pmd_numa(pmd))
+ return 0;
+
+ if (!gup_hugepte((pte_t *)pmdp, PMD_SIZE, addr, next,
+ write, pages, nr))
+ return 0;
+ } else if (is_hugepd(pmdp)) {
if (!gup_hugepd((hugepd_t *)pmdp, PMD_SHIFT,
addr, next, write, pages, nr))
return 0;
@@ -87,12 +110,16 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
pudp = pud_offset(&pgd, addr);
do {
- pud_t pud = *pudp;
+ pud_t pud = ACCESS_ONCE(*pudp);
next = pud_addr_end(addr, end);
if (pud_none(pud))
return 0;
- if (is_hugepd(pudp)) {
+ if (pud_huge(pud)) {
+ if (!gup_hugepte((pte_t *)pudp, PUD_SIZE, addr, next,
+ write, pages, nr))
+ return 0;
+ } else if (is_hugepd(pudp)) {
if (!gup_hugepd((hugepd_t *)pudp, PUD_SHIFT,
addr, next, write, pages, nr))
return 0;
@@ -103,12 +130,13 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
return 1;
}
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages)
+int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
+ struct page **pages)
{
struct mm_struct *mm = current->mm;
unsigned long addr, len, end;
unsigned long next;
+ unsigned long flags;
pgd_t *pgdp;
int nr = 0;
@@ -121,7 +149,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
start, len)))
- goto slow_irqon;
+ return 0;
pr_devel(" aligned: %lx .. %lx\n", start, end);
@@ -142,36 +170,45 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
* So long as we atomically load page table pointers versus teardown,
* we can follow the address down to the the page and take a ref on it.
*/
- local_irq_disable();
+ local_irq_save(flags);
pgdp = pgd_offset(mm, addr);
do {
- pgd_t pgd = *pgdp;
+ pgd_t pgd = ACCESS_ONCE(*pgdp);
pr_devel(" %016lx: normal pgd %p\n", addr,
(void *)pgd_val(pgd));
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
- goto slow;
- if (is_hugepd(pgdp)) {
+ break;
+ if (pgd_huge(pgd)) {
+ if (!gup_hugepte((pte_t *)pgdp, PGDIR_SIZE, addr, next,
+ write, pages, &nr))
+ break;
+ } else if (is_hugepd(pgdp)) {
if (!gup_hugepd((hugepd_t *)pgdp, PGDIR_SHIFT,
addr, next, write, pages, &nr))
- goto slow;
+ break;
} else if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
- goto slow;
+ break;
} while (pgdp++, addr = next, addr != end);
- local_irq_enable();
+ local_irq_restore(flags);
- VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
return nr;
+}
- {
- int ret;
+int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+ struct page **pages)
+{
+ struct mm_struct *mm = current->mm;
+ int nr, ret;
-slow:
- local_irq_enable();
-slow_irqon:
+ start &= PAGE_MASK;
+ nr = __get_user_pages_fast(start, nr_pages, write, pages);
+ ret = nr;
+
+ if (nr < nr_pages) {
pr_devel(" slow path ! nr = %d\n", nr);
/* Try to get the remaining pages with get_user_pages */
@@ -180,7 +217,7 @@ slow_irqon:
down_read(&mm->mmap_sem);
ret = get_user_pages(current, mm, start,
- (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
+ nr_pages - nr, write, 0, pages, NULL);
up_read(&mm->mmap_sem);
/* Have to be a bit careful with return values */
@@ -190,9 +227,9 @@ slow_irqon:
else
ret += nr;
}
-
- return ret;
}
+
+ return ret;
}
#endif /* __HAVE_ARCH_PTE_SPECIAL */
diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S
index 56585086413..057cbbb4c57 100644
--- a/arch/powerpc/mm/hash_low_64.S
+++ b/arch/powerpc/mm/hash_low_64.S
@@ -115,11 +115,13 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
sldi r29,r5,SID_SHIFT - VPN_SHIFT
rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT - VPN_SHIFT)
or r29,r28,r29
-
- /* Calculate hash value for primary slot and store it in r28 */
- rldicl r5,r5,0,25 /* vsid & 0x0000007fffffffff */
- rldicl r0,r3,64-12,48 /* (ea >> 12) & 0xffff */
- xor r28,r5,r0
+ /*
+ * Calculate hash value for primary slot and store it in r28
+ * r3 = va, r5 = vsid
+ * r0 = (va >> 12) & ((1ul << (28 - 12)) -1)
+ */
+ rldicl r0,r3,64-12,48
+ xor r28,r5,r0 /* hash */
b 4f
3: /* Calc vpn and put it in r29 */
@@ -130,11 +132,12 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
/*
* calculate hash value for primary slot and
* store it in r28 for 1T segment
+ * r3 = va, r5 = vsid
*/
- rldic r28,r5,25,25 /* (vsid << 25) & 0x7fffffffff */
- clrldi r5,r5,40 /* vsid & 0xffffff */
- rldicl r0,r3,64-12,36 /* (ea >> 12) & 0xfffffff */
- xor r28,r28,r5
+ sldi r28,r5,25 /* vsid << 25 */
+ /* r0 = (va >> 12) & ((1ul << (40 - 12)) -1) */
+ rldicl r0,r3,64-12,36
+ xor r28,r28,r5 /* vsid ^ ( vsid << 25) */
xor r28,r28,r0 /* hash */
/* Convert linux PTE bits into HW equivalents */
@@ -145,7 +148,10 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/
andc r0,r30,r0 /* r0 = pte & ~r0 */
rlwimi r3,r0,32-1,31,31 /* Insert result into PP lsb */
- ori r3,r3,HPTE_R_C /* Always add "C" bit for perf. */
+ /*
+ * Always add "C" bit for perf. Memory coherence is always enabled
+ */
+ ori r3,r3,HPTE_R_C | HPTE_R_M
/* We eventually do the icache sync here (maybe inline that
* code rather than call a C function...)
@@ -153,7 +159,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
BEGIN_FTR_SECTION
mr r4,r30
mr r5,r7
- bl .hash_page_do_lazy_icache
+ bl hash_page_do_lazy_icache
END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
/* At this point, r3 contains new PP bits, save them in
@@ -193,8 +199,10 @@ htab_insert_pte:
mr r4,r29 /* Retrieve vpn */
li r7,0 /* !bolted, !secondary */
li r8,MMU_PAGE_4K /* page size */
- ld r9,STK_PARAM(R9)(r1) /* segment size */
-_GLOBAL(htab_call_hpte_insert1)
+ li r9,MMU_PAGE_4K /* actual page size */
+ ld r10,STK_PARAM(R9)(r1) /* segment size */
+.globl htab_call_hpte_insert1
+htab_call_hpte_insert1:
bl . /* Patched by htab_finish_init() */
cmpdi 0,r3,0
bge htab_pte_insert_ok /* Insertion successful */
@@ -216,8 +224,10 @@ _GLOBAL(htab_call_hpte_insert1)
mr r4,r29 /* Retrieve vpn */
li r7,HPTE_V_SECONDARY /* !bolted, secondary */
li r8,MMU_PAGE_4K /* page size */
- ld r9,STK_PARAM(R9)(r1) /* segment size */
-_GLOBAL(htab_call_hpte_insert2)
+ li r9,MMU_PAGE_4K /* actual page size */
+ ld r10,STK_PARAM(R9)(r1) /* segment size */
+.globl htab_call_hpte_insert2
+htab_call_hpte_insert2:
bl . /* Patched by htab_finish_init() */
cmpdi 0,r3,0
bge+ htab_pte_insert_ok /* Insertion successful */
@@ -234,7 +244,8 @@ _GLOBAL(htab_call_hpte_insert2)
2: and r0,r5,r27
rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */
/* Call ppc_md.hpte_remove */
-_GLOBAL(htab_call_hpte_remove)
+.globl htab_call_hpte_remove
+htab_call_hpte_remove:
bl . /* Patched by htab_finish_init() */
/* Try all again */
@@ -284,10 +295,12 @@ htab_modify_pte:
/* Call ppc_md.hpte_updatepp */
mr r5,r29 /* vpn */
- li r6,MMU_PAGE_4K /* page size */
- ld r7,STK_PARAM(R9)(r1) /* segment size */
- ld r8,STK_PARAM(R8)(r1) /* get "local" param */
-_GLOBAL(htab_call_hpte_updatepp)
+ li r6,MMU_PAGE_4K /* base page size */
+ li r7,MMU_PAGE_4K /* actual page size */
+ ld r8,STK_PARAM(R9)(r1) /* segment size */
+ ld r9,STK_PARAM(R8)(r1) /* get "local" param */
+.globl htab_call_hpte_updatepp
+htab_call_hpte_updatepp:
bl . /* Patched by htab_finish_init() */
/* if we failed because typically the HPTE wasn't really here
@@ -407,11 +420,13 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
*/
rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT - VPN_SHIFT)
or r29,r28,r29
-
- /* Calculate hash value for primary slot and store it in r28 */
- rldicl r5,r5,0,25 /* vsid & 0x0000007fffffffff */
- rldicl r0,r3,64-12,48 /* (ea >> 12) & 0xffff */
- xor r28,r5,r0
+ /*
+ * Calculate hash value for primary slot and store it in r28
+ * r3 = va, r5 = vsid
+ * r0 = (va >> 12) & ((1ul << (28 - 12)) -1)
+ */
+ rldicl r0,r3,64-12,48
+ xor r28,r5,r0 /* hash */
b 4f
3: /* Calc vpn and put it in r29 */
@@ -426,11 +441,12 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
/*
* Calculate hash value for primary slot and
* store it in r28 for 1T segment
+ * r3 = va, r5 = vsid
*/
- rldic r28,r5,25,25 /* (vsid << 25) & 0x7fffffffff */
- clrldi r5,r5,40 /* vsid & 0xffffff */
- rldicl r0,r3,64-12,36 /* (ea >> 12) & 0xfffffff */
- xor r28,r28,r5
+ sldi r28,r5,25 /* vsid << 25 */
+ /* r0 = (va >> 12) & ((1ul << (40 - 12)) -1) */
+ rldicl r0,r3,64-12,36
+ xor r28,r28,r5 /* vsid ^ ( vsid << 25) */
xor r28,r28,r0 /* hash */
/* Convert linux PTE bits into HW equivalents */
@@ -448,7 +464,10 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/
andc r0,r3,r0 /* r0 = pte & ~r0 */
rlwimi r3,r0,32-1,31,31 /* Insert result into PP lsb */
- ori r3,r3,HPTE_R_C /* Always add "C" bit for perf. */
+ /*
+ * Always add "C" bit for perf. Memory coherence is always enabled
+ */
+ ori r3,r3,HPTE_R_C | HPTE_R_M
/* We eventually do the icache sync here (maybe inline that
* code rather than call a C function...)
@@ -456,7 +475,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
BEGIN_FTR_SECTION
mr r4,r30
mr r5,r7
- bl .hash_page_do_lazy_icache
+ bl hash_page_do_lazy_icache
END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
/* At this point, r3 contains new PP bits, save them in
@@ -484,7 +503,7 @@ END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
beq htab_inval_old_hpte
ld r6,STK_PARAM(R6)(r1)
- ori r26,r6,0x8000 /* Load the hidx mask */
+ ori r26,r6,PTE_PAGE_HIDX_OFFSET /* Load the hidx mask. */
ld r26,0(r26)
addi r5,r25,36 /* Check actual HPTE_SUB bit, this */
rldcr. r0,r31,r5,0 /* must match pgtable.h definition */
@@ -509,8 +528,10 @@ htab_special_pfn:
mr r4,r29 /* Retrieve vpn */
li r7,0 /* !bolted, !secondary */
li r8,MMU_PAGE_4K /* page size */
- ld r9,STK_PARAM(R9)(r1) /* segment size */
-_GLOBAL(htab_call_hpte_insert1)
+ li r9,MMU_PAGE_4K /* actual page size */
+ ld r10,STK_PARAM(R9)(r1) /* segment size */
+.globl htab_call_hpte_insert1
+htab_call_hpte_insert1:
bl . /* patched by htab_finish_init() */
cmpdi 0,r3,0
bge htab_pte_insert_ok /* Insertion successful */
@@ -536,8 +557,10 @@ _GLOBAL(htab_call_hpte_insert1)
mr r4,r29 /* Retrieve vpn */
li r7,HPTE_V_SECONDARY /* !bolted, secondary */
li r8,MMU_PAGE_4K /* page size */
- ld r9,STK_PARAM(R9)(r1) /* segment size */
-_GLOBAL(htab_call_hpte_insert2)
+ li r9,MMU_PAGE_4K /* actual page size */
+ ld r10,STK_PARAM(R9)(r1) /* segment size */
+.globl htab_call_hpte_insert2
+htab_call_hpte_insert2:
bl . /* patched by htab_finish_init() */
cmpdi 0,r3,0
bge+ htab_pte_insert_ok /* Insertion successful */
@@ -554,7 +577,8 @@ _GLOBAL(htab_call_hpte_insert2)
2: and r0,r5,r27
rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */
/* Call ppc_md.hpte_remove */
-_GLOBAL(htab_call_hpte_remove)
+.globl htab_call_hpte_remove
+htab_call_hpte_remove:
bl . /* patched by htab_finish_init() */
/* Try all again */
@@ -571,7 +595,7 @@ htab_inval_old_hpte:
li r6,MMU_PAGE_64K /* psize */
ld r7,STK_PARAM(R9)(r1) /* ssize */
ld r8,STK_PARAM(R8)(r1) /* local */
- bl .flush_hash_page
+ bl flush_hash_page
/* Clear out _PAGE_HPTE_SUB bits in the new linux PTE */
lis r0,_PAGE_HPTE_SUB@h
ori r0,r0,_PAGE_HPTE_SUB@l
@@ -601,7 +625,7 @@ htab_pte_insert_ok:
sld r4,r4,r5
andc r26,r26,r4
or r26,r26,r3
- ori r5,r6,0x8000
+ ori r5,r6,PTE_PAGE_HIDX_OFFSET
std r26,0(r5)
lwsync
std r30,0(r6)
@@ -639,10 +663,12 @@ htab_modify_pte:
/* Call ppc_md.hpte_updatepp */
mr r5,r29 /* vpn */
- li r6,MMU_PAGE_4K /* page size */
- ld r7,STK_PARAM(R9)(r1) /* segment size */
- ld r8,STK_PARAM(R8)(r1) /* get "local" param */
-_GLOBAL(htab_call_hpte_updatepp)
+ li r6,MMU_PAGE_4K /* base page size */
+ li r7,MMU_PAGE_4K /* actual page size */
+ ld r8,STK_PARAM(R9)(r1) /* segment size */
+ ld r9,STK_PARAM(R8)(r1) /* get "local" param */
+.globl htab_call_hpte_updatepp
+htab_call_hpte_updatepp:
bl . /* patched by htab_finish_init() */
/* if we failed because typically the HPTE wasn't really here
@@ -752,25 +778,27 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT - VPN_SHIFT)
or r29,r28,r29
- /* Calculate hash value for primary slot and store it in r28 */
- rldicl r5,r5,0,25 /* vsid & 0x0000007fffffffff */
- rldicl r0,r3,64-16,52 /* (ea >> 16) & 0xfff */
- xor r28,r5,r0
+ /* Calculate hash value for primary slot and store it in r28
+ * r3 = va, r5 = vsid
+ * r0 = (va >> 16) & ((1ul << (28 - 16)) -1)
+ */
+ rldicl r0,r3,64-16,52
+ xor r28,r5,r0 /* hash */
b 4f
3: /* Calc vpn and put it in r29 */
sldi r29,r5,SID_SHIFT_1T - VPN_SHIFT
rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT_1T - VPN_SHIFT)
or r29,r28,r29
-
/*
* calculate hash value for primary slot and
* store it in r28 for 1T segment
+ * r3 = va, r5 = vsid
*/
- rldic r28,r5,25,25 /* (vsid << 25) & 0x7fffffffff */
- clrldi r5,r5,40 /* vsid & 0xffffff */
- rldicl r0,r3,64-16,40 /* (ea >> 16) & 0xffffff */
- xor r28,r28,r5
+ sldi r28,r5,25 /* vsid << 25 */
+ /* r0 = (va >> 16) & ((1ul << (40 - 16)) -1) */
+ rldicl r0,r3,64-16,40
+ xor r28,r28,r5 /* vsid ^ ( vsid << 25) */
xor r28,r28,r0 /* hash */
/* Convert linux PTE bits into HW equivalents */
@@ -781,7 +809,10 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/
andc r0,r30,r0 /* r0 = pte & ~r0 */
rlwimi r3,r0,32-1,31,31 /* Insert result into PP lsb */
- ori r3,r3,HPTE_R_C /* Always add "C" bit for perf. */
+ /*
+ * Always add "C" bit for perf. Memory coherence is always enabled
+ */
+ ori r3,r3,HPTE_R_C | HPTE_R_M
/* We eventually do the icache sync here (maybe inline that
* code rather than call a C function...)
@@ -789,7 +820,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
BEGIN_FTR_SECTION
mr r4,r30
mr r5,r7
- bl .hash_page_do_lazy_icache
+ bl hash_page_do_lazy_icache
END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
/* At this point, r3 contains new PP bits, save them in
@@ -832,8 +863,10 @@ ht64_insert_pte:
mr r4,r29 /* Retrieve vpn */
li r7,0 /* !bolted, !secondary */
li r8,MMU_PAGE_64K
- ld r9,STK_PARAM(R9)(r1) /* segment size */
-_GLOBAL(ht64_call_hpte_insert1)
+ li r9,MMU_PAGE_64K /* actual page size */
+ ld r10,STK_PARAM(R9)(r1) /* segment size */
+.globl ht64_call_hpte_insert1
+ht64_call_hpte_insert1:
bl . /* patched by htab_finish_init() */
cmpdi 0,r3,0
bge ht64_pte_insert_ok /* Insertion successful */
@@ -855,8 +888,10 @@ _GLOBAL(ht64_call_hpte_insert1)
mr r4,r29 /* Retrieve vpn */
li r7,HPTE_V_SECONDARY /* !bolted, secondary */
li r8,MMU_PAGE_64K
- ld r9,STK_PARAM(R9)(r1) /* segment size */
-_GLOBAL(ht64_call_hpte_insert2)
+ li r9,MMU_PAGE_64K /* actual page size */
+ ld r10,STK_PARAM(R9)(r1) /* segment size */
+.globl ht64_call_hpte_insert2
+ht64_call_hpte_insert2:
bl . /* patched by htab_finish_init() */
cmpdi 0,r3,0
bge+ ht64_pte_insert_ok /* Insertion successful */
@@ -873,7 +908,8 @@ _GLOBAL(ht64_call_hpte_insert2)
2: and r0,r5,r27
rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */
/* Call ppc_md.hpte_remove */
-_GLOBAL(ht64_call_hpte_remove)
+.globl ht64_call_hpte_remove
+ht64_call_hpte_remove:
bl . /* patched by htab_finish_init() */
/* Try all again */
@@ -923,10 +959,12 @@ ht64_modify_pte:
/* Call ppc_md.hpte_updatepp */
mr r5,r29 /* vpn */
- li r6,MMU_PAGE_64K
- ld r7,STK_PARAM(R9)(r1) /* segment size */
- ld r8,STK_PARAM(R8)(r1) /* get "local" param */
-_GLOBAL(ht64_call_hpte_updatepp)
+ li r6,MMU_PAGE_64K /* base page size */
+ li r7,MMU_PAGE_64K /* actual page size */
+ ld r8,STK_PARAM(R9)(r1) /* segment size */
+ ld r9,STK_PARAM(R8)(r1) /* get "local" param */
+.globl ht64_call_hpte_updatepp
+ht64_call_hpte_updatepp:
bl . /* patched by htab_finish_init() */
/* if we failed because typically the HPTE wasn't really here
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index ffc1e00f7a2..cf1d325eae8 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -35,14 +35,19 @@
#define DBG_LOW(fmt...)
#endif
+#ifdef __BIG_ENDIAN__
#define HPTE_LOCK_BIT 3
+#else
+#define HPTE_LOCK_BIT (56+3)
+#endif
DEFINE_RAW_SPINLOCK(native_tlbie_lock);
-static inline void __tlbie(unsigned long vpn, int psize, int ssize)
+static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize)
{
unsigned long va;
unsigned int penc;
+ unsigned long sllp;
/*
* We need 14 to 65 bits of va for a tlibe of 4K page
@@ -61,17 +66,30 @@ static inline void __tlbie(unsigned long vpn, int psize, int ssize)
switch (psize) {
case MMU_PAGE_4K:
+ /* clear out bits after (52) [0....52.....63] */
+ va &= ~((1ul << (64 - 52)) - 1);
va |= ssize << 8;
+ sllp = ((mmu_psize_defs[apsize].sllp & SLB_VSID_L) >> 6) |
+ ((mmu_psize_defs[apsize].sllp & SLB_VSID_LP) >> 4);
+ va |= sllp << 5;
asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2)
: : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
: "memory");
break;
default:
/* We need 14 to 14 + i bits of va */
- penc = mmu_psize_defs[psize].penc;
- va &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
+ penc = mmu_psize_defs[psize].penc[apsize];
+ va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1);
va |= penc << 12;
va |= ssize << 8;
+ /*
+ * AVAL bits:
+ * We don't need all the bits, but rest of the bits
+ * must be ignored by the processor.
+ * vpn cover upto 65 bits of va. (0...65) and we need
+ * 58..64 bits of va.
+ */
+ va |= (vpn & 0xfe); /* AVAL */
va |= 1; /* L */
asm volatile(ASM_FTR_IFCLR("tlbie %0,1", PPC_TLBIE(%1,%0), %2)
: : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
@@ -80,10 +98,11 @@ static inline void __tlbie(unsigned long vpn, int psize, int ssize)
}
}
-static inline void __tlbiel(unsigned long vpn, int psize, int ssize)
+static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize)
{
unsigned long va;
unsigned int penc;
+ unsigned long sllp;
/* VPN_SHIFT can be atmost 12 */
va = vpn << VPN_SHIFT;
@@ -96,16 +115,29 @@ static inline void __tlbiel(unsigned long vpn, int psize, int ssize)
switch (psize) {
case MMU_PAGE_4K:
+ /* clear out bits after(52) [0....52.....63] */
+ va &= ~((1ul << (64 - 52)) - 1);
va |= ssize << 8;
+ sllp = ((mmu_psize_defs[apsize].sllp & SLB_VSID_L) >> 6) |
+ ((mmu_psize_defs[apsize].sllp & SLB_VSID_LP) >> 4);
+ va |= sllp << 5;
asm volatile(".long 0x7c000224 | (%0 << 11) | (0 << 21)"
: : "r"(va) : "memory");
break;
default:
/* We need 14 to 14 + i bits of va */
- penc = mmu_psize_defs[psize].penc;
- va &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
+ penc = mmu_psize_defs[psize].penc[apsize];
+ va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1);
va |= penc << 12;
va |= ssize << 8;
+ /*
+ * AVAL bits:
+ * We don't need all the bits, but rest of the bits
+ * must be ignored by the processor.
+ * vpn cover upto 65 bits of va. (0...65) and we need
+ * 58..64 bits of va.
+ */
+ va |= (vpn & 0xfe);
va |= 1; /* L */
asm volatile(".long 0x7c000224 | (%0 << 11) | (1 << 21)"
: : "r"(va) : "memory");
@@ -114,7 +146,8 @@ static inline void __tlbiel(unsigned long vpn, int psize, int ssize)
}
-static inline void tlbie(unsigned long vpn, int psize, int ssize, int local)
+static inline void tlbie(unsigned long vpn, int psize, int apsize,
+ int ssize, int local)
{
unsigned int use_local = local && mmu_has_feature(MMU_FTR_TLBIEL);
int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
@@ -125,10 +158,10 @@ static inline void tlbie(unsigned long vpn, int psize, int ssize, int local)
raw_spin_lock(&native_tlbie_lock);
asm volatile("ptesync": : :"memory");
if (use_local) {
- __tlbiel(vpn, psize, ssize);
+ __tlbiel(vpn, psize, apsize, ssize);
asm volatile("ptesync": : :"memory");
} else {
- __tlbie(vpn, psize, ssize);
+ __tlbie(vpn, psize, apsize, ssize);
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
if (lock_tlbie && !use_local)
@@ -137,7 +170,7 @@ static inline void tlbie(unsigned long vpn, int psize, int ssize, int local)
static inline void native_lock_hpte(struct hash_pte *hptep)
{
- unsigned long *word = &hptep->v;
+ unsigned long *word = (unsigned long *)&hptep->v;
while (1) {
if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word))
@@ -149,14 +182,14 @@ static inline void native_lock_hpte(struct hash_pte *hptep)
static inline void native_unlock_hpte(struct hash_pte *hptep)
{
- unsigned long *word = &hptep->v;
+ unsigned long *word = (unsigned long *)&hptep->v;
clear_bit_unlock(HPTE_LOCK_BIT, word);
}
static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
unsigned long pa, unsigned long rflags,
- unsigned long vflags, int psize, int ssize)
+ unsigned long vflags, int psize, int apsize, int ssize)
{
struct hash_pte *hptep = htab_address + hpte_group;
unsigned long hpte_v, hpte_r;
@@ -169,10 +202,10 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
}
for (i = 0; i < HPTES_PER_GROUP; i++) {
- if (! (hptep->v & HPTE_V_VALID)) {
+ if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID)) {
/* retry with lock held */
native_lock_hpte(hptep);
- if (! (hptep->v & HPTE_V_VALID))
+ if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID))
break;
native_unlock_hpte(hptep);
}
@@ -183,22 +216,22 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
if (i == HPTES_PER_GROUP)
return -1;
- hpte_v = hpte_encode_v(vpn, psize, ssize) | vflags | HPTE_V_VALID;
- hpte_r = hpte_encode_r(pa, psize) | rflags;
+ hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
+ hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
if (!(vflags & HPTE_V_BOLTED)) {
DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n",
i, hpte_v, hpte_r);
}
- hptep->r = hpte_r;
+ hptep->r = cpu_to_be64(hpte_r);
/* Guarantee the second dword is visible before the valid bit */
eieio();
/*
* Now set the first dword including the valid bit
* NOTE: this also unlocks the hpte
*/
- hptep->v = hpte_v;
+ hptep->v = cpu_to_be64(hpte_v);
__asm__ __volatile__ ("ptesync" : : : "memory");
@@ -219,12 +252,12 @@ static long native_hpte_remove(unsigned long hpte_group)
for (i = 0; i < HPTES_PER_GROUP; i++) {
hptep = htab_address + hpte_group + slot_offset;
- hpte_v = hptep->v;
+ hpte_v = be64_to_cpu(hptep->v);
if ((hpte_v & HPTE_V_VALID) && !(hpte_v & HPTE_V_BOLTED)) {
/* retry with lock held */
native_lock_hpte(hptep);
- hpte_v = hptep->v;
+ hpte_v = be64_to_cpu(hptep->v);
if ((hpte_v & HPTE_V_VALID)
&& !(hpte_v & HPTE_V_BOLTED))
break;
@@ -245,36 +278,41 @@ static long native_hpte_remove(unsigned long hpte_group)
}
static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
- unsigned long vpn, int psize, int ssize,
- int local)
+ unsigned long vpn, int bpsize,
+ int apsize, int ssize, int local)
{
struct hash_pte *hptep = htab_address + slot;
unsigned long hpte_v, want_v;
int ret = 0;
- want_v = hpte_encode_v(vpn, psize, ssize);
+ want_v = hpte_encode_avpn(vpn, bpsize, ssize);
DBG_LOW(" update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)",
vpn, want_v & HPTE_V_AVPN, slot, newpp);
native_lock_hpte(hptep);
- hpte_v = hptep->v;
-
- /* Even if we miss, we need to invalidate the TLB */
+ hpte_v = be64_to_cpu(hptep->v);
+ /*
+ * We need to invalidate the TLB always because hpte_remove doesn't do
+ * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
+ * random entry from it. When we do that we don't invalidate the TLB
+ * (hpte_remove) because we assume the old translation is still
+ * technically "valid".
+ */
if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) {
DBG_LOW(" -> miss\n");
ret = -1;
} else {
DBG_LOW(" -> hit\n");
/* Update the HPTE */
- hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) |
- (newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_C));
+ hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) & ~(HPTE_R_PP | HPTE_R_N)) |
+ (newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_C)));
}
native_unlock_hpte(hptep);
/* Ensure it is out of the tlb too. */
- tlbie(vpn, psize, ssize, local);
+ tlbie(vpn, bpsize, apsize, ssize, local);
return ret;
}
@@ -288,13 +326,13 @@ static long native_hpte_find(unsigned long vpn, int psize, int ssize)
unsigned long want_v, hpte_v;
hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
- want_v = hpte_encode_v(vpn, psize, ssize);
+ want_v = hpte_encode_avpn(vpn, psize, ssize);
/* Bolted mappings are only ever in the primary group */
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
for (i = 0; i < HPTES_PER_GROUP; i++) {
hptep = htab_address + slot;
- hpte_v = hptep->v;
+ hpte_v = be64_to_cpu(hptep->v);
if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
/* HPTE matches */
@@ -329,15 +367,18 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
hptep = htab_address + slot;
/* Update the HPTE */
- hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) |
- (newpp & (HPTE_R_PP | HPTE_R_N));
-
- /* Ensure it is out of the tlb too. */
- tlbie(vpn, psize, ssize, 0);
+ hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) &
+ ~(HPTE_R_PP | HPTE_R_N)) |
+ (newpp & (HPTE_R_PP | HPTE_R_N)));
+ /*
+ * Ensure it is out of the tlb too. Bolted entries base and
+ * actual page size will be same.
+ */
+ tlbie(vpn, psize, psize, ssize, 0);
}
static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
- int psize, int ssize, int local)
+ int bpsize, int apsize, int ssize, int local)
{
struct hash_pte *hptep = htab_address + slot;
unsigned long hpte_v;
@@ -348,11 +389,17 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
DBG_LOW(" invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot);
- want_v = hpte_encode_v(vpn, psize, ssize);
+ want_v = hpte_encode_avpn(vpn, bpsize, ssize);
native_lock_hpte(hptep);
- hpte_v = hptep->v;
+ hpte_v = be64_to_cpu(hptep->v);
- /* Even if we miss, we need to invalidate the TLB */
+ /*
+ * We need to invalidate the TLB always because hpte_remove doesn't do
+ * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
+ * random entry from it. When we do that we don't invalidate the TLB
+ * (hpte_remove) because we assume the old translation is still
+ * technically "valid".
+ */
if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
native_unlock_hpte(hptep);
else
@@ -360,47 +407,139 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
hptep->v = 0;
/* Invalidate the TLB */
- tlbie(vpn, psize, ssize, local);
+ tlbie(vpn, bpsize, apsize, ssize, local);
+
+ local_irq_restore(flags);
+}
+
+static void native_hugepage_invalidate(struct mm_struct *mm,
+ unsigned char *hpte_slot_array,
+ unsigned long addr, int psize)
+{
+ int ssize = 0, i;
+ int lock_tlbie;
+ struct hash_pte *hptep;
+ int actual_psize = MMU_PAGE_16M;
+ unsigned int max_hpte_count, valid;
+ unsigned long flags, s_addr = addr;
+ unsigned long hpte_v, want_v, shift;
+ unsigned long hidx, vpn = 0, vsid, hash, slot;
+
+ shift = mmu_psize_defs[psize].shift;
+ max_hpte_count = 1U << (PMD_SHIFT - shift);
+
+ local_irq_save(flags);
+ for (i = 0; i < max_hpte_count; i++) {
+ valid = hpte_valid(hpte_slot_array, i);
+ if (!valid)
+ continue;
+ hidx = hpte_hash_index(hpte_slot_array, i);
+
+ /* get the vpn */
+ addr = s_addr + (i * (1ul << shift));
+ if (!is_kernel_addr(addr)) {
+ ssize = user_segment_size(addr);
+ vsid = get_vsid(mm->context.id, addr, ssize);
+ WARN_ON(vsid == 0);
+ } else {
+ vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+ ssize = mmu_kernel_ssize;
+ }
+
+ vpn = hpt_vpn(addr, vsid, ssize);
+ hash = hpt_hash(vpn, shift, ssize);
+ if (hidx & _PTEIDX_SECONDARY)
+ hash = ~hash;
+
+ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot += hidx & _PTEIDX_GROUP_IX;
+
+ hptep = htab_address + slot;
+ want_v = hpte_encode_avpn(vpn, psize, ssize);
+ native_lock_hpte(hptep);
+ hpte_v = be64_to_cpu(hptep->v);
+
+ /* Even if we miss, we need to invalidate the TLB */
+ if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
+ native_unlock_hpte(hptep);
+ else
+ /* Invalidate the hpte. NOTE: this also unlocks it */
+ hptep->v = 0;
+ }
+ /*
+ * Since this is a hugepage, we just need a single tlbie.
+ * use the last vpn.
+ */
+ lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
+ if (lock_tlbie)
+ raw_spin_lock(&native_tlbie_lock);
+
+ asm volatile("ptesync":::"memory");
+ __tlbie(vpn, psize, actual_psize, ssize);
+ asm volatile("eieio; tlbsync; ptesync":::"memory");
+
+ if (lock_tlbie)
+ raw_spin_unlock(&native_tlbie_lock);
local_irq_restore(flags);
}
-#define LP_SHIFT 12
-#define LP_BITS 8
-#define LP_MASK(i) ((0xFF >> (i)) << LP_SHIFT)
+static inline int __hpte_actual_psize(unsigned int lp, int psize)
+{
+ int i, shift;
+ unsigned int mask;
+
+ /* start from 1 ignoring MMU_PAGE_4K */
+ for (i = 1; i < MMU_PAGE_COUNT; i++) {
+
+ /* invalid penc */
+ if (mmu_psize_defs[psize].penc[i] == -1)
+ continue;
+ /*
+ * encoding bits per actual page size
+ * PTE LP actual page size
+ * rrrr rrrz >=8KB
+ * rrrr rrzz >=16KB
+ * rrrr rzzz >=32KB
+ * rrrr zzzz >=64KB
+ * .......
+ */
+ shift = mmu_psize_defs[i].shift - LP_SHIFT;
+ if (shift > LP_BITS)
+ shift = LP_BITS;
+ mask = (1 << shift) - 1;
+ if ((lp & mask) == mmu_psize_defs[psize].penc[i])
+ return i;
+ }
+ return -1;
+}
static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
- int *psize, int *ssize, unsigned long *vpn)
+ int *psize, int *apsize, int *ssize, unsigned long *vpn)
{
unsigned long avpn, pteg, vpi;
- unsigned long hpte_r = hpte->r;
- unsigned long hpte_v = hpte->v;
+ unsigned long hpte_v = be64_to_cpu(hpte->v);
+ unsigned long hpte_r = be64_to_cpu(hpte->r);
unsigned long vsid, seg_off;
- int i, size, shift, penc;
+ int size, a_size, shift;
+ /* Look at the 8 bit LP value */
+ unsigned int lp = (hpte_r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
- if (!(hpte_v & HPTE_V_LARGE))
- size = MMU_PAGE_4K;
- else {
- for (i = 0; i < LP_BITS; i++) {
- if ((hpte_r & LP_MASK(i+1)) == LP_MASK(i+1))
- break;
- }
- penc = LP_MASK(i+1) >> LP_SHIFT;
+ if (!(hpte_v & HPTE_V_LARGE)) {
+ size = MMU_PAGE_4K;
+ a_size = MMU_PAGE_4K;
+ } else {
for (size = 0; size < MMU_PAGE_COUNT; size++) {
- /* 4K pages are not represented by LP */
- if (size == MMU_PAGE_4K)
- continue;
-
/* valid entries have a shift value */
if (!mmu_psize_defs[size].shift)
continue;
- if (penc == mmu_psize_defs[size].penc)
+ a_size = __hpte_actual_psize(lp, size);
+ if (a_size != -1)
break;
}
}
-
/* This works for all page sizes, and for 256M and 1T segments */
*ssize = hpte_v >> HPTE_V_SSIZE_SHIFT;
shift = mmu_psize_defs[size].shift;
@@ -421,6 +560,7 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
seg_off |= vpi << shift;
}
*vpn = vsid << (SID_SHIFT - VPN_SHIFT) | seg_off >> VPN_SHIFT;
+ break;
case MMU_SEGSIZE_1T:
/* We only have 40 - 23 bits of seg_off in avpn */
seg_off = (avpn & 0x1ffff) << 23;
@@ -430,10 +570,12 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
seg_off |= vpi << shift;
}
*vpn = vsid << (SID_SHIFT_1T - VPN_SHIFT) | seg_off >> VPN_SHIFT;
+ break;
default:
*vpn = size = 0;
}
- *psize = size;
+ *psize = size;
+ *apsize = a_size;
}
/*
@@ -451,7 +593,7 @@ static void native_hpte_clear(void)
struct hash_pte *hptep = htab_address;
unsigned long hpte_v;
unsigned long pteg_count;
- int psize, ssize;
+ int psize, apsize, ssize;
pteg_count = htab_hash_mask + 1;
@@ -470,16 +612,16 @@ static void native_hpte_clear(void)
* running, right? and for crash dump, we probably
* don't want to wait for a maybe bad cpu.
*/
- hpte_v = hptep->v;
+ hpte_v = be64_to_cpu(hptep->v);
/*
* Call __tlbie() here rather than tlbie() since we
* already hold the native_tlbie_lock.
*/
if (hpte_v & HPTE_V_VALID) {
- hpte_decode(hptep, slot, &psize, &ssize, &vpn);
+ hpte_decode(hptep, slot, &psize, &apsize, &ssize, &vpn);
hptep->v = 0;
- __tlbie(vpn, psize, ssize);
+ __tlbie(vpn, psize, apsize, ssize);
}
}
@@ -520,9 +662,9 @@ static void native_flush_hash_range(unsigned long number, int local)
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
slot += hidx & _PTEIDX_GROUP_IX;
hptep = htab_address + slot;
- want_v = hpte_encode_v(vpn, psize, ssize);
+ want_v = hpte_encode_avpn(vpn, psize, ssize);
native_lock_hpte(hptep);
- hpte_v = hptep->v;
+ hpte_v = be64_to_cpu(hptep->v);
if (!HPTE_V_COMPARE(hpte_v, want_v) ||
!(hpte_v & HPTE_V_VALID))
native_unlock_hpte(hptep);
@@ -540,7 +682,7 @@ static void native_flush_hash_range(unsigned long number, int local)
pte_iterate_hashed_subpages(pte, psize,
vpn, index, shift) {
- __tlbiel(vpn, psize, ssize);
+ __tlbiel(vpn, psize, psize, ssize);
} pte_iterate_hashed_end();
}
asm volatile("ptesync":::"memory");
@@ -557,7 +699,7 @@ static void native_flush_hash_range(unsigned long number, int local)
pte_iterate_hashed_subpages(pte, psize,
vpn, index, shift) {
- __tlbie(vpn, psize, ssize);
+ __tlbie(vpn, psize, psize, ssize);
} pte_iterate_hashed_end();
}
asm volatile("eieio; tlbsync; ptesync":::"memory");
@@ -578,4 +720,5 @@ void __init hpte_init_native(void)
ppc_md.hpte_remove = native_hpte_remove;
ppc_md.hpte_clear_all = native_hpte_clear;
ppc_md.flush_hash_range = native_flush_hash_range;
+ ppc_md.hugepage_invalidate = native_hugepage_invalidate;
}
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 3a292be2e07..88fdd9d2507 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -33,6 +33,7 @@
#include <linux/init.h>
#include <linux/signal.h>
#include <linux/memblock.h>
+#include <linux/context_tracking.h>
#include <asm/processor.h>
#include <asm/pgtable.h>
@@ -55,6 +56,7 @@
#include <asm/code-patching.h>
#include <asm/fadump.h>
#include <asm/firmware.h>
+#include <asm/tm.h>
#ifdef DEBUG
#define DBG(fmt...) udbg_printf(fmt)
@@ -125,7 +127,7 @@ static struct mmu_psize_def mmu_psize_defaults_old[] = {
[MMU_PAGE_4K] = {
.shift = 12,
.sllp = 0,
- .penc = 0,
+ .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1},
.avpnm = 0,
.tlbiel = 0,
},
@@ -139,14 +141,15 @@ static struct mmu_psize_def mmu_psize_defaults_gp[] = {
[MMU_PAGE_4K] = {
.shift = 12,
.sllp = 0,
- .penc = 0,
+ .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1},
.avpnm = 0,
.tlbiel = 1,
},
[MMU_PAGE_16M] = {
.shift = 24,
.sllp = SLB_VSID_L,
- .penc = 0,
+ .penc = {[0 ... MMU_PAGE_16M - 1] = -1, [MMU_PAGE_16M] = 0,
+ [MMU_PAGE_16M + 1 ... MMU_PAGE_COUNT - 1] = -1 },
.avpnm = 0x1UL,
.tlbiel = 0,
},
@@ -166,9 +169,10 @@ static unsigned long htab_convert_pte_flags(unsigned long pteflags)
if ((pteflags & _PAGE_USER) && !((pteflags & _PAGE_RW) &&
(pteflags & _PAGE_DIRTY)))
rflags |= 1;
-
- /* Always add C */
- return rflags | HPTE_R_C;
+ /*
+ * Always add "C" bit for perf. Memory coherence is always enabled
+ */
+ return rflags | HPTE_R_C | HPTE_R_M;
}
int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
@@ -194,16 +198,39 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
unsigned long vpn = hpt_vpn(vaddr, vsid, ssize);
unsigned long tprot = prot;
+ /*
+ * If we hit a bad address return error.
+ */
+ if (!vsid)
+ return -1;
/* Make kernel text executable */
if (overlaps_kernel_text(vaddr, vaddr + step))
tprot &= ~HPTE_R_N;
+ /* Make kvm guest trampolines executable */
+ if (overlaps_kvm_tmp(vaddr, vaddr + step))
+ tprot &= ~HPTE_R_N;
+
+ /*
+ * If relocatable, check if it overlaps interrupt vectors that
+ * are copied down to real 0. For relocatable kernel
+ * (e.g. kdump case) we copy interrupt vectors down to real
+ * address 0. Mark that region as executable. This is
+ * because on p8 system with relocation on exception feature
+ * enabled, exceptions are raised with MMU (IR=DR=1) ON. Hence
+ * in order to execute the interrupt handlers in virtual
+ * mode the vector region need to be marked as executable.
+ */
+ if ((PHYSICAL_START > MEMORY_START) &&
+ overlaps_interrupt_vector_text(vaddr, vaddr + step))
+ tprot &= ~HPTE_R_N;
+
hash = hpt_hash(vpn, shift, ssize);
hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
BUG_ON(!ppc_md.hpte_insert);
ret = ppc_md.hpte_insert(hpteg, vpn, paddr, tprot,
- HPTE_V_BOLTED, psize, ssize);
+ HPTE_V_BOLTED, psize, psize, ssize);
if (ret < 0)
break;
@@ -242,20 +269,19 @@ static int __init htab_dt_scan_seg_sizes(unsigned long node,
const char *uname, int depth,
void *data)
{
- char *type = of_get_flat_dt_prop(node, "device_type", NULL);
- u32 *prop;
- unsigned long size = 0;
+ const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+ const __be32 *prop;
+ int size = 0;
/* We are scanning "cpu" nodes only */
if (type == NULL || strcmp(type, "cpu") != 0)
return 0;
- prop = (u32 *)of_get_flat_dt_prop(node, "ibm,processor-segment-sizes",
- &size);
+ prop = of_get_flat_dt_prop(node, "ibm,processor-segment-sizes", &size);
if (prop == NULL)
return 0;
for (; size >= 4; size -= 4, ++prop) {
- if (prop[0] == 40) {
+ if (be32_to_cpu(prop[0]) == 40) {
DBG("1T segment support detected\n");
cur_cpu_spec->mmu_features |= MMU_FTR_1T_SEGMENT;
return 1;
@@ -270,79 +296,103 @@ static void __init htab_init_seg_sizes(void)
of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL);
}
+static int __init get_idx_from_shift(unsigned int shift)
+{
+ int idx = -1;
+
+ switch (shift) {
+ case 0xc:
+ idx = MMU_PAGE_4K;
+ break;
+ case 0x10:
+ idx = MMU_PAGE_64K;
+ break;
+ case 0x14:
+ idx = MMU_PAGE_1M;
+ break;
+ case 0x18:
+ idx = MMU_PAGE_16M;
+ break;
+ case 0x22:
+ idx = MMU_PAGE_16G;
+ break;
+ }
+ return idx;
+}
+
static int __init htab_dt_scan_page_sizes(unsigned long node,
const char *uname, int depth,
void *data)
{
- char *type = of_get_flat_dt_prop(node, "device_type", NULL);
- u32 *prop;
- unsigned long size = 0;
+ const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+ const __be32 *prop;
+ int size = 0;
/* We are scanning "cpu" nodes only */
if (type == NULL || strcmp(type, "cpu") != 0)
return 0;
- prop = (u32 *)of_get_flat_dt_prop(node,
- "ibm,segment-page-sizes", &size);
+ prop = of_get_flat_dt_prop(node, "ibm,segment-page-sizes", &size);
if (prop != NULL) {
- DBG("Page sizes from device-tree:\n");
+ pr_info("Page sizes from device-tree:\n");
size /= 4;
cur_cpu_spec->mmu_features &= ~(MMU_FTR_16M_PAGE);
while(size > 0) {
- unsigned int shift = prop[0];
- unsigned int slbenc = prop[1];
- unsigned int lpnum = prop[2];
- unsigned int lpenc = 0;
+ unsigned int base_shift = be32_to_cpu(prop[0]);
+ unsigned int slbenc = be32_to_cpu(prop[1]);
+ unsigned int lpnum = be32_to_cpu(prop[2]);
struct mmu_psize_def *def;
- int idx = -1;
+ int idx, base_idx;
size -= 3; prop += 3;
- while(size > 0 && lpnum) {
- if (prop[0] == shift)
- lpenc = prop[1];
- prop += 2; size -= 2;
- lpnum--;
+ base_idx = get_idx_from_shift(base_shift);
+ if (base_idx < 0) {
+ /*
+ * skip the pte encoding also
+ */
+ prop += lpnum * 2; size -= lpnum * 2;
+ continue;
}
- switch(shift) {
- case 0xc:
- idx = MMU_PAGE_4K;
- break;
- case 0x10:
- idx = MMU_PAGE_64K;
- break;
- case 0x14:
- idx = MMU_PAGE_1M;
- break;
- case 0x18:
- idx = MMU_PAGE_16M;
+ def = &mmu_psize_defs[base_idx];
+ if (base_idx == MMU_PAGE_16M)
cur_cpu_spec->mmu_features |= MMU_FTR_16M_PAGE;
- break;
- case 0x22:
- idx = MMU_PAGE_16G;
- break;
- }
- if (idx < 0)
- continue;
- def = &mmu_psize_defs[idx];
- def->shift = shift;
- if (shift <= 23)
+
+ def->shift = base_shift;
+ if (base_shift <= 23)
def->avpnm = 0;
else
- def->avpnm = (1 << (shift - 23)) - 1;
+ def->avpnm = (1 << (base_shift - 23)) - 1;
def->sllp = slbenc;
- def->penc = lpenc;
- /* We don't know for sure what's up with tlbiel, so
+ /*
+ * We don't know for sure what's up with tlbiel, so
* for now we only set it for 4K and 64K pages
*/
- if (idx == MMU_PAGE_4K || idx == MMU_PAGE_64K)
+ if (base_idx == MMU_PAGE_4K || base_idx == MMU_PAGE_64K)
def->tlbiel = 1;
else
def->tlbiel = 0;
- DBG(" %d: shift=%02x, sllp=%04lx, avpnm=%08lx, "
- "tlbiel=%d, penc=%d\n",
- idx, shift, def->sllp, def->avpnm, def->tlbiel,
- def->penc);
+ while (size > 0 && lpnum) {
+ unsigned int shift = be32_to_cpu(prop[0]);
+ int penc = be32_to_cpu(prop[1]);
+
+ prop += 2; size -= 2;
+ lpnum--;
+
+ idx = get_idx_from_shift(shift);
+ if (idx < 0)
+ continue;
+
+ if (penc == -1)
+ pr_err("Invalid penc for base_shift=%d "
+ "shift=%d\n", base_shift, shift);
+
+ def->penc[idx] = penc;
+ pr_info("base_shift=%d: shift=%d, sllp=0x%04lx,"
+ " avpnm=0x%08lx, tlbiel=%d, penc=%d\n",
+ base_shift, shift, def->sllp,
+ def->avpnm, def->tlbiel, def->penc[idx]);
+ }
}
return 1;
}
@@ -356,9 +406,9 @@ static int __init htab_dt_scan_page_sizes(unsigned long node,
static int __init htab_dt_scan_hugepage_blocks(unsigned long node,
const char *uname, int depth,
void *data) {
- char *type = of_get_flat_dt_prop(node, "device_type", NULL);
- unsigned long *addr_prop;
- u32 *page_count_prop;
+ const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+ const __be64 *addr_prop;
+ const __be32 *page_count_prop;
unsigned int expected_pages;
long unsigned int phys_addr;
long unsigned int block_size;
@@ -372,12 +422,12 @@ static int __init htab_dt_scan_hugepage_blocks(unsigned long node,
page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL);
if (page_count_prop == NULL)
return 0;
- expected_pages = (1 << page_count_prop[0]);
+ expected_pages = (1 << be32_to_cpu(page_count_prop[0]));
addr_prop = of_get_flat_dt_prop(node, "reg", NULL);
if (addr_prop == NULL)
return 0;
- phys_addr = addr_prop[0];
- block_size = addr_prop[1];
+ phys_addr = be64_to_cpu(addr_prop[0]);
+ block_size = be64_to_cpu(addr_prop[1]);
if (block_size != (16 * GB))
return 0;
printk(KERN_INFO "Huge page(16GB) memory: "
@@ -391,10 +441,39 @@ static int __init htab_dt_scan_hugepage_blocks(unsigned long node,
}
#endif /* CONFIG_HUGETLB_PAGE */
+static void mmu_psize_set_default_penc(void)
+{
+ int bpsize, apsize;
+ for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++)
+ for (apsize = 0; apsize < MMU_PAGE_COUNT; apsize++)
+ mmu_psize_defs[bpsize].penc[apsize] = -1;
+}
+
+#ifdef CONFIG_PPC_64K_PAGES
+
+static bool might_have_hea(void)
+{
+ /*
+ * The HEA ethernet adapter requires awareness of the
+ * GX bus. Without that awareness we can easily assume
+ * we will never see an HEA ethernet device.
+ */
+#ifdef CONFIG_IBMEBUS
+ return !cpu_has_feature(CPU_FTR_ARCH_207S);
+#else
+ return false;
+#endif
+}
+
+#endif /* #ifdef CONFIG_PPC_64K_PAGES */
+
static void __init htab_init_page_sizes(void)
{
int rc;
+ /* se the invalid penc to -1 */
+ mmu_psize_set_default_penc();
+
/* Default to 4K pages only */
memcpy(mmu_psize_defs, mmu_psize_defaults_old,
sizeof(mmu_psize_defaults_old));
@@ -442,10 +521,11 @@ static void __init htab_init_page_sizes(void)
mmu_linear_psize = MMU_PAGE_64K;
if (mmu_has_feature(MMU_FTR_CI_LARGE_PAGE)) {
/*
- * Don't use 64k pages for ioremap on pSeries, since
- * that would stop us accessing the HEA ethernet.
+ * When running on pSeries using 64k pages for ioremap
+ * would stop us accessing the HEA ethernet. So if we
+ * have the chance of ever seeing one, stay at 4k.
*/
- if (!machine_is(pseries))
+ if (!might_have_hea() || !machine_is(pseries))
mmu_io_psize = MMU_PAGE_64K;
} else
mmu_ci_restrictions = 1;
@@ -489,17 +569,17 @@ static int __init htab_dt_scan_pftsize(unsigned long node,
const char *uname, int depth,
void *data)
{
- char *type = of_get_flat_dt_prop(node, "device_type", NULL);
- u32 *prop;
+ const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+ const __be32 *prop;
/* We are scanning "cpu" nodes only */
if (type == NULL || strcmp(type, "cpu") != 0)
return 0;
- prop = (u32 *)of_get_flat_dt_prop(node, "ibm,pft-size", NULL);
+ prop = of_get_flat_dt_prop(node, "ibm,pft-size", NULL);
if (prop != NULL) {
/* pft_size[0] is the NUMA CEC cookie */
- ppc64_pft_size = prop[1];
+ ppc64_pft_size = be32_to_cpu(prop[1]);
return 1;
}
return 0;
@@ -546,47 +626,43 @@ int remove_section_mapping(unsigned long start, unsigned long end)
}
#endif /* CONFIG_MEMORY_HOTPLUG */
-#define FUNCTION_TEXT(A) ((*(unsigned long *)(A)))
+extern u32 htab_call_hpte_insert1[];
+extern u32 htab_call_hpte_insert2[];
+extern u32 htab_call_hpte_remove[];
+extern u32 htab_call_hpte_updatepp[];
+extern u32 ht64_call_hpte_insert1[];
+extern u32 ht64_call_hpte_insert2[];
+extern u32 ht64_call_hpte_remove[];
+extern u32 ht64_call_hpte_updatepp[];
static void __init htab_finish_init(void)
{
- extern unsigned int *htab_call_hpte_insert1;
- extern unsigned int *htab_call_hpte_insert2;
- extern unsigned int *htab_call_hpte_remove;
- extern unsigned int *htab_call_hpte_updatepp;
-
#ifdef CONFIG_PPC_HAS_HASH_64K
- extern unsigned int *ht64_call_hpte_insert1;
- extern unsigned int *ht64_call_hpte_insert2;
- extern unsigned int *ht64_call_hpte_remove;
- extern unsigned int *ht64_call_hpte_updatepp;
-
patch_branch(ht64_call_hpte_insert1,
- FUNCTION_TEXT(ppc_md.hpte_insert),
+ ppc_function_entry(ppc_md.hpte_insert),
BRANCH_SET_LINK);
patch_branch(ht64_call_hpte_insert2,
- FUNCTION_TEXT(ppc_md.hpte_insert),
+ ppc_function_entry(ppc_md.hpte_insert),
BRANCH_SET_LINK);
patch_branch(ht64_call_hpte_remove,
- FUNCTION_TEXT(ppc_md.hpte_remove),
+ ppc_function_entry(ppc_md.hpte_remove),
BRANCH_SET_LINK);
patch_branch(ht64_call_hpte_updatepp,
- FUNCTION_TEXT(ppc_md.hpte_updatepp),
+ ppc_function_entry(ppc_md.hpte_updatepp),
BRANCH_SET_LINK);
-
#endif /* CONFIG_PPC_HAS_HASH_64K */
patch_branch(htab_call_hpte_insert1,
- FUNCTION_TEXT(ppc_md.hpte_insert),
+ ppc_function_entry(ppc_md.hpte_insert),
BRANCH_SET_LINK);
patch_branch(htab_call_hpte_insert2,
- FUNCTION_TEXT(ppc_md.hpte_insert),
+ ppc_function_entry(ppc_md.hpte_insert),
BRANCH_SET_LINK);
patch_branch(htab_call_hpte_remove,
- FUNCTION_TEXT(ppc_md.hpte_remove),
+ ppc_function_entry(ppc_md.hpte_remove),
BRANCH_SET_LINK);
patch_branch(htab_call_hpte_updatepp,
- FUNCTION_TEXT(ppc_md.hpte_updatepp),
+ ppc_function_entry(ppc_md.hpte_updatepp),
BRANCH_SET_LINK);
}
@@ -758,10 +834,12 @@ void __init early_init_mmu(void)
/* Initialize stab / SLB management */
if (mmu_has_feature(MMU_FTR_SLB))
slb_initialize();
+ else
+ stab_initialize(get_paca()->stab_real);
}
#ifdef CONFIG_SMP
-void __cpuinit early_init_mmu_secondary(void)
+void early_init_mmu_secondary(void)
{
/* Initialize hash table for that CPU */
if (!firmware_has_feature(FW_FEATURE_LPAR))
@@ -861,7 +939,7 @@ static int subpage_protection(struct mm_struct *mm, unsigned long ea)
if (ea >= spt->maxaddr)
return 0;
- if (ea < 0x100000000) {
+ if (ea < 0x100000000UL) {
/* addresses below 4GB use spt->low_prot */
sbpm = spt->low_prot;
} else {
@@ -891,14 +969,30 @@ static inline int subpage_protection(struct mm_struct *mm, unsigned long ea)
void hash_failure_debug(unsigned long ea, unsigned long access,
unsigned long vsid, unsigned long trap,
- int ssize, int psize, unsigned long pte)
+ int ssize, int psize, int lpsize, unsigned long pte)
{
if (!printk_ratelimit())
return;
pr_info("mm: Hashing failure ! EA=0x%lx access=0x%lx current=%s\n",
ea, access, current->comm);
- pr_info(" trap=0x%lx vsid=0x%lx ssize=%d psize=%d pte=0x%lx\n",
- trap, vsid, ssize, psize, pte);
+ pr_info(" trap=0x%lx vsid=0x%lx ssize=%d base psize=%d psize %d pte=0x%lx\n",
+ trap, vsid, ssize, psize, lpsize, pte);
+}
+
+static void check_paca_psize(unsigned long ea, struct mm_struct *mm,
+ int psize, bool user_region)
+{
+ if (user_region) {
+ if (psize != get_paca_psize(ea)) {
+ get_paca()->context = mm->context;
+ slb_flush_and_rebolt();
+ }
+ } else if (get_paca()->vmalloc_sllp !=
+ mmu_psize_defs[mmu_vmalloc_psize].sllp) {
+ get_paca()->vmalloc_sllp =
+ mmu_psize_defs[mmu_vmalloc_psize].sllp;
+ slb_vmalloc_update();
+ }
}
/* Result code is:
@@ -909,6 +1003,7 @@ void hash_failure_debug(unsigned long ea, unsigned long access,
*/
int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
{
+ enum ctx_state prev_state = exception_enter();
pgd_t *pgdir;
unsigned long vsid;
struct mm_struct *mm;
@@ -921,11 +1016,6 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n",
ea, access, trap);
- if ((ea & ~REGION_MASK) >= PGTABLE_RANGE) {
- DBG_LOW(" out of pgtable range !\n");
- return 1;
- }
-
/* Get region & vsid */
switch (REGION_ID(ea)) {
case USER_REGION_ID:
@@ -933,7 +1023,8 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
mm = current->mm;
if (! mm) {
DBG_LOW(" user region with no mm !\n");
- return 1;
+ rc = 1;
+ goto bail;
}
psize = get_slice_psize(mm, ea);
ssize = user_segment_size(ea);
@@ -952,14 +1043,23 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
/* Not a valid range
* Send the problem up to do_page_fault
*/
- return 1;
+ rc = 1;
+ goto bail;
}
DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid);
+ /* Bad address. */
+ if (!vsid) {
+ DBG_LOW("Bad address!\n");
+ rc = 1;
+ goto bail;
+ }
/* Get pgdir */
pgdir = mm->pgd;
- if (pgdir == NULL)
- return 1;
+ if (pgdir == NULL) {
+ rc = 1;
+ goto bail;
+ }
/* Check CPU locality */
tmp = cpumask_of(smp_processor_id());
@@ -982,7 +1082,8 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugeshift);
if (ptep == NULL || !pte_present(*ptep)) {
DBG_LOW(" no PTE !\n");
- return 1;
+ rc = 1;
+ goto bail;
}
/* Add _PAGE_PRESENT to the required access perm */
@@ -993,14 +1094,32 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
*/
if (access & ~pte_val(*ptep)) {
DBG_LOW(" no access !\n");
- return 1;
+ rc = 1;
+ goto bail;
}
+ if (hugeshift) {
+ if (pmd_trans_huge(*(pmd_t *)ptep))
+ rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep,
+ trap, local, ssize, psize);
#ifdef CONFIG_HUGETLB_PAGE
- if (hugeshift)
- return __hash_page_huge(ea, access, vsid, ptep, trap, local,
- ssize, hugeshift, psize);
-#endif /* CONFIG_HUGETLB_PAGE */
+ else
+ rc = __hash_page_huge(ea, access, vsid, ptep, trap,
+ local, ssize, hugeshift, psize);
+#else
+ else {
+ /*
+ * if we have hugeshift, and is not transhuge with
+ * hugetlb disabled, something is really wrong.
+ */
+ rc = 1;
+ WARN_ON(1);
+ }
+#endif
+ check_paca_psize(ea, mm, psize, user_region);
+
+ goto bail;
+ }
#ifndef CONFIG_PPC_64K_PAGES
DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
@@ -1039,17 +1158,8 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
#endif
}
}
- if (user_region) {
- if (psize != get_paca_psize(ea)) {
- get_paca()->context = mm->context;
- slb_flush_and_rebolt();
- }
- } else if (get_paca()->vmalloc_sllp !=
- mmu_psize_defs[mmu_vmalloc_psize].sllp) {
- get_paca()->vmalloc_sllp =
- mmu_psize_defs[mmu_vmalloc_psize].sllp;
- slb_vmalloc_update();
- }
+
+ check_paca_psize(ea, mm, psize, user_region);
#endif /* CONFIG_PPC_64K_PAGES */
#ifdef CONFIG_PPC_HAS_HASH_64K
@@ -1071,7 +1181,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
*/
if (rc == -1)
hash_failure_debug(ea, access, vsid, trap, ssize, psize,
- pte_val(*ptep));
+ psize, pte_val(*ptep));
#ifndef CONFIG_PPC_64K_PAGES
DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep));
#else
@@ -1079,6 +1189,9 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
pte_val(*(ptep + PTRS_PER_PTE)));
#endif
DBG_LOW(" -> rc=%d\n", rc);
+
+bail:
+ exception_exit(prev_state);
return rc;
}
EXPORT_SYMBOL_GPL(hash_page);
@@ -1086,6 +1199,7 @@ EXPORT_SYMBOL_GPL(hash_page);
void hash_preload(struct mm_struct *mm, unsigned long ea,
unsigned long access, unsigned long trap)
{
+ int hugepage_shift;
unsigned long vsid;
pgd_t *pgdir;
pte_t *ptep;
@@ -1107,10 +1221,27 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
pgdir = mm->pgd;
if (pgdir == NULL)
return;
- ptep = find_linux_pte(pgdir, ea);
- if (!ptep)
+
+ /* Get VSID */
+ ssize = user_segment_size(ea);
+ vsid = get_vsid(mm->context.id, ea, ssize);
+ if (!vsid)
return;
+ /*
+ * Hash doesn't like irqs. Walking linux page table with irq disabled
+ * saves us from holding multiple locks.
+ */
+ local_irq_save(flags);
+ /*
+ * THP pages use update_mmu_cache_pmd. We don't do
+ * hash preload there. Hence can ignore THP here
+ */
+ ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugepage_shift);
+ if (!ptep)
+ goto out_exit;
+
+ WARN_ON(hugepage_shift);
#ifdef CONFIG_PPC_64K_PAGES
/* If either _PAGE_4K_PFN or _PAGE_NO_CACHE is set (and we are on
* a 64K kernel), then we don't preload, hash_page() will take
@@ -1119,16 +1250,9 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
* page size demotion here
*/
if (pte_val(*ptep) & (_PAGE_4K_PFN | _PAGE_NO_CACHE))
- return;
+ goto out_exit;
#endif /* CONFIG_PPC_64K_PAGES */
- /* Get VSID */
- ssize = user_segment_size(ea);
- vsid = get_vsid(mm->context.id, ea, ssize);
-
- /* Hash doesn't like irqs */
- local_irq_save(flags);
-
/* Is that local to this CPU ? */
if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
local = 1;
@@ -1147,8 +1271,10 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
*/
if (rc == -1)
hash_failure_debug(ea, access, vsid, trap, ssize,
- mm->context.user_psize, pte_val(*ptep));
-
+ mm->context.user_psize,
+ mm->context.user_psize,
+ pte_val(*ptep));
+out_exit:
local_irq_restore(flags);
}
@@ -1169,8 +1295,28 @@ void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize,
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
slot += hidx & _PTEIDX_GROUP_IX;
DBG_LOW(" sub %ld: hash=%lx, hidx=%lx\n", index, slot, hidx);
- ppc_md.hpte_invalidate(slot, vpn, psize, ssize, local);
+ /*
+ * We use same base page size and actual psize, because we don't
+ * use these functions for hugepage
+ */
+ ppc_md.hpte_invalidate(slot, vpn, psize, psize, ssize, local);
} pte_iterate_hashed_end();
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ /* Transactions are not aborted by tlbiel, only tlbie.
+ * Without, syncing a page back to a block device w/ PIO could pick up
+ * transactional data (bad!) so we force an abort here. Before the
+ * sync the page will be made read-only, which will flush_hash_page.
+ * BIG ISSUE here: if the kernel uses a page from userspace without
+ * unmapping it first, it may see the speculated version.
+ */
+ if (local && cpu_has_feature(CPU_FTR_TM) &&
+ current->thread.regs &&
+ MSR_TM_ACTIVE(current->thread.regs->msr)) {
+ tm_enable();
+ tm_abort(TM_CAUSE_TLBI);
+ }
+#endif
}
void flush_hash_range(unsigned long number, int local)
@@ -1194,6 +1340,8 @@ void flush_hash_range(unsigned long number, int local)
*/
void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc)
{
+ enum ctx_state prev_state = exception_enter();
+
if (user_mode(regs)) {
#ifdef CONFIG_PPC_SUBPAGE_PROT
if (rc == -2)
@@ -1203,23 +1351,64 @@ void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc)
_exception(SIGBUS, regs, BUS_ADRERR, address);
} else
bad_page_fault(regs, address, SIGBUS);
+
+ exception_exit(prev_state);
+}
+
+long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
+ unsigned long pa, unsigned long rflags,
+ unsigned long vflags, int psize, int ssize)
+{
+ unsigned long hpte_group;
+ long slot;
+
+repeat:
+ hpte_group = ((hash & htab_hash_mask) *
+ HPTES_PER_GROUP) & ~0x7UL;
+
+ /* Insert into the hash table, primary slot */
+ slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, vflags,
+ psize, psize, ssize);
+
+ /* Primary is full, try the secondary */
+ if (unlikely(slot == -1)) {
+ hpte_group = ((~hash & htab_hash_mask) *
+ HPTES_PER_GROUP) & ~0x7UL;
+ slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags,
+ vflags | HPTE_V_SECONDARY,
+ psize, psize, ssize);
+ if (slot == -1) {
+ if (mftb() & 0x1)
+ hpte_group = ((hash & htab_hash_mask) *
+ HPTES_PER_GROUP)&~0x7UL;
+
+ ppc_md.hpte_remove(hpte_group);
+ goto repeat;
+ }
+ }
+
+ return slot;
}
#ifdef CONFIG_DEBUG_PAGEALLOC
static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
{
- unsigned long hash, hpteg;
+ unsigned long hash;
unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
unsigned long mode = htab_convert_pte_flags(PAGE_KERNEL);
- int ret;
+ long ret;
hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
- hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
- ret = ppc_md.hpte_insert(hpteg, vpn, __pa(vaddr),
- mode, HPTE_V_BOLTED,
- mmu_linear_psize, mmu_kernel_ssize);
+ /* Don't create HPTE entries for bad address */
+ if (!vsid)
+ return;
+
+ ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode,
+ HPTE_V_BOLTED,
+ mmu_linear_psize, mmu_kernel_ssize);
+
BUG_ON (ret < 0);
spin_lock(&linear_map_hash_lock);
BUG_ON(linear_map_hash_slots[lmi] & 0x80);
@@ -1243,7 +1432,8 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
hash = ~hash;
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
slot += hidx & _PTEIDX_GROUP_IX;
- ppc_md.hpte_invalidate(slot, vpn, mmu_linear_psize, mmu_kernel_ssize, 0);
+ ppc_md.hpte_invalidate(slot, vpn, mmu_linear_psize, mmu_linear_psize,
+ mmu_kernel_ssize, 0);
}
void kernel_map_pages(struct page *page, int numpages, int enable)
diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c
new file mode 100644
index 00000000000..826893fcb3a
--- /dev/null
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -0,0 +1,179 @@
+/*
+ * Copyright IBM Corporation, 2013
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+/*
+ * PPC64 THP Support for hash based MMUs
+ */
+#include <linux/mm.h>
+#include <asm/machdep.h>
+
+int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
+ pmd_t *pmdp, unsigned long trap, int local, int ssize,
+ unsigned int psize)
+{
+ unsigned int index, valid;
+ unsigned char *hpte_slot_array;
+ unsigned long rflags, pa, hidx;
+ unsigned long old_pmd, new_pmd;
+ int ret, lpsize = MMU_PAGE_16M;
+ unsigned long vpn, hash, shift, slot;
+
+ /*
+ * atomically mark the linux large page PMD busy and dirty
+ */
+ do {
+ old_pmd = pmd_val(*pmdp);
+ /* If PMD busy, retry the access */
+ if (unlikely(old_pmd & _PAGE_BUSY))
+ return 0;
+ /* If PMD is trans splitting retry the access */
+ if (unlikely(old_pmd & _PAGE_SPLITTING))
+ return 0;
+ /* If PMD permissions don't match, take page fault */
+ if (unlikely(access & ~old_pmd))
+ return 1;
+ /*
+ * Try to lock the PTE, add ACCESSED and DIRTY if it was
+ * a write access
+ */
+ new_pmd = old_pmd | _PAGE_BUSY | _PAGE_ACCESSED;
+ if (access & _PAGE_RW)
+ new_pmd |= _PAGE_DIRTY;
+ } while (old_pmd != __cmpxchg_u64((unsigned long *)pmdp,
+ old_pmd, new_pmd));
+ /*
+ * PP bits. _PAGE_USER is already PP bit 0x2, so we only
+ * need to add in 0x1 if it's a read-only user page
+ */
+ rflags = new_pmd & _PAGE_USER;
+ if ((new_pmd & _PAGE_USER) && !((new_pmd & _PAGE_RW) &&
+ (new_pmd & _PAGE_DIRTY)))
+ rflags |= 0x1;
+ /*
+ * _PAGE_EXEC -> HW_NO_EXEC since it's inverted
+ */
+ rflags |= ((new_pmd & _PAGE_EXEC) ? 0 : HPTE_R_N);
+
+#if 0
+ if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
+
+ /*
+ * No CPU has hugepages but lacks no execute, so we
+ * don't need to worry about that case
+ */
+ rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
+ }
+#endif
+ /*
+ * Find the slot index details for this ea, using base page size.
+ */
+ shift = mmu_psize_defs[psize].shift;
+ index = (ea & ~HPAGE_PMD_MASK) >> shift;
+ BUG_ON(index >= 4096);
+
+ vpn = hpt_vpn(ea, vsid, ssize);
+ hash = hpt_hash(vpn, shift, ssize);
+ hpte_slot_array = get_hpte_slot_array(pmdp);
+
+ valid = hpte_valid(hpte_slot_array, index);
+ if (valid) {
+ /* update the hpte bits */
+ hidx = hpte_hash_index(hpte_slot_array, index);
+ if (hidx & _PTEIDX_SECONDARY)
+ hash = ~hash;
+ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot += hidx & _PTEIDX_GROUP_IX;
+
+ ret = ppc_md.hpte_updatepp(slot, rflags, vpn,
+ psize, lpsize, ssize, local);
+ /*
+ * We failed to update, try to insert a new entry.
+ */
+ if (ret == -1) {
+ /*
+ * large pte is marked busy, so we can be sure
+ * nobody is looking at hpte_slot_array. hence we can
+ * safely update this here.
+ */
+ valid = 0;
+ new_pmd &= ~_PAGE_HPTEFLAGS;
+ hpte_slot_array[index] = 0;
+ } else
+ /* clear the busy bits and set the hash pte bits */
+ new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
+ }
+
+ if (!valid) {
+ unsigned long hpte_group;
+
+ /* insert new entry */
+ pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT;
+repeat:
+ hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
+
+ /* clear the busy bits and set the hash pte bits */
+ new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
+
+ /* Add in WIMG bits */
+ rflags |= (new_pmd & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
+ _PAGE_GUARDED));
+ /*
+ * enable the memory coherence always
+ */
+ rflags |= HPTE_R_M;
+
+ /* Insert into the hash table, primary slot */
+ slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0,
+ psize, lpsize, ssize);
+ /*
+ * Primary is full, try the secondary
+ */
+ if (unlikely(slot == -1)) {
+ hpte_group = ((~hash & htab_hash_mask) *
+ HPTES_PER_GROUP) & ~0x7UL;
+ slot = ppc_md.hpte_insert(hpte_group, vpn, pa,
+ rflags, HPTE_V_SECONDARY,
+ psize, lpsize, ssize);
+ if (slot == -1) {
+ if (mftb() & 0x1)
+ hpte_group = ((hash & htab_hash_mask) *
+ HPTES_PER_GROUP) & ~0x7UL;
+
+ ppc_md.hpte_remove(hpte_group);
+ goto repeat;
+ }
+ }
+ /*
+ * Hypervisor failure. Restore old pmd and return -1
+ * similar to __hash_page_*
+ */
+ if (unlikely(slot == -2)) {
+ *pmdp = __pmd(old_pmd);
+ hash_failure_debug(ea, access, vsid, trap, ssize,
+ psize, lpsize, old_pmd);
+ return -1;
+ }
+ /*
+ * large pte is marked busy, so we can be sure
+ * nobody is looking at hpte_slot_array. hence we can
+ * safely update this here.
+ */
+ mark_hpte_slot_valid(hpte_slot_array, index, slot);
+ }
+ /*
+ * No need to use ldarx/stdcx here
+ */
+ *pmdp = __pmd(new_pmd & ~_PAGE_BUSY);
+ return 0;
+}
diff --git a/arch/powerpc/mm/hugetlbpage-book3e.c b/arch/powerpc/mm/hugetlbpage-book3e.c
index 3bc700655fc..5e4ee257390 100644
--- a/arch/powerpc/mm/hugetlbpage-book3e.c
+++ b/arch/powerpc/mm/hugetlbpage-book3e.c
@@ -8,6 +8,44 @@
#include <linux/mm.h>
#include <linux/hugetlb.h>
+#ifdef CONFIG_PPC_FSL_BOOK3E
+#ifdef CONFIG_PPC64
+static inline int tlb1_next(void)
+{
+ struct paca_struct *paca = get_paca();
+ struct tlb_core_data *tcd;
+ int this, next;
+
+ tcd = paca->tcd_ptr;
+ this = tcd->esel_next;
+
+ next = this + 1;
+ if (next >= tcd->esel_max)
+ next = tcd->esel_first;
+
+ tcd->esel_next = next;
+ return this;
+}
+#else
+static inline int tlb1_next(void)
+{
+ int index, ncams;
+
+ ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY;
+
+ index = __get_cpu_var(next_tlbcam_idx);
+
+ /* Just round-robin the entries and wrap when we hit the end */
+ if (unlikely(index == ncams - 1))
+ __get_cpu_var(next_tlbcam_idx) = tlbcam_index;
+ else
+ __get_cpu_var(next_tlbcam_idx)++;
+
+ return index;
+}
+#endif /* !PPC64 */
+#endif /* FSL */
+
static inline int mmu_get_tsize(int psize)
{
return mmu_psize_defs[psize].enc;
@@ -47,7 +85,7 @@ void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea,
struct mm_struct *mm;
#ifdef CONFIG_PPC_FSL_BOOK3E
- int index, ncams;
+ int index;
#endif
if (unlikely(is_kernel_addr(ea)))
@@ -77,18 +115,11 @@ void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea,
}
#ifdef CONFIG_PPC_FSL_BOOK3E
- ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY;
-
/* We have to use the CAM(TLB1) on FSL parts for hugepages */
- index = __get_cpu_var(next_tlbcam_idx);
+ index = tlb1_next();
mtspr(SPRN_MAS0, MAS0_ESEL(index) | MAS0_TLBSEL(1));
-
- /* Just round-robin the entries and wrap when we hit the end */
- if (unlikely(index == ncams - 1))
- __get_cpu_var(next_tlbcam_idx) = tlbcam_index;
- else
- __get_cpu_var(next_tlbcam_idx)++;
#endif
+
mas1 = MAS1_VALID | MAS1_TID(mm->context.id) | MAS1_TSIZE(tsize);
mas2 = ea & ~((1UL << shift) - 1);
mas2 |= (pte_val(pte) >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK;
@@ -103,7 +134,8 @@ void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea,
if (mmu_has_feature(MMU_FTR_USE_PAIRED_MAS)) {
mtspr(SPRN_MAS7_MAS3, mas7_3);
} else {
- mtspr(SPRN_MAS7, upper_32_bits(mas7_3));
+ if (mmu_has_feature(MMU_FTR_BIG_PHYS))
+ mtspr(SPRN_MAS7, upper_32_bits(mas7_3));
mtspr(SPRN_MAS3, lower_32_bits(mas7_3));
}
@@ -117,6 +149,5 @@ void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
struct hstate *hstate = hstate_file(vma->vm_file);
unsigned long tsize = huge_page_shift(hstate) - 10;
- __flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, tsize, 0);
-
+ __flush_tlb_page(vma->vm_mm, vmaddr, tsize, 0);
}
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
index cecad348f60..a5bcf930119 100644
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -14,6 +14,10 @@
#include <asm/cacheflush.h>
#include <asm/machdep.h>
+extern long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
+ unsigned long pa, unsigned long rlags,
+ unsigned long vflags, int psize, int ssize);
+
int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
pte_t *ptep, unsigned long trap, int local, int ssize,
unsigned int shift, unsigned int mmu_psize)
@@ -77,20 +81,15 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
slot += (old_pte & _PAGE_F_GIX) >> 12;
if (ppc_md.hpte_updatepp(slot, rflags, vpn, mmu_psize,
- ssize, local) == -1)
+ mmu_psize, ssize, local) == -1)
old_pte &= ~_PAGE_HPTEFLAGS;
}
if (likely(!(old_pte & _PAGE_HASHPTE))) {
unsigned long hash = hpt_hash(vpn, shift, ssize);
- unsigned long hpte_group;
pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
-repeat:
- hpte_group = ((hash & htab_hash_mask) *
- HPTES_PER_GROUP) & ~0x7UL;
-
/* clear HPTE slot informations in new PTE */
#ifdef CONFIG_PPC_64K_PAGES
new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;
@@ -100,27 +99,13 @@ repeat:
/* Add in WIMG bits */
rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
_PAGE_COHERENT | _PAGE_GUARDED));
+ /*
+ * enable the memory coherence always
+ */
+ rflags |= HPTE_R_M;
- /* Insert into the hash table, primary slot */
- slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0,
- mmu_psize, ssize);
-
- /* Primary is full, try the secondary */
- if (unlikely(slot == -1)) {
- hpte_group = ((~hash & htab_hash_mask) *
- HPTES_PER_GROUP) & ~0x7UL;
- slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags,
- HPTE_V_SECONDARY,
- mmu_psize, ssize);
- if (slot == -1) {
- if (mftb() & 0x1)
- hpte_group = ((hash & htab_hash_mask) *
- HPTES_PER_GROUP)&~0x7UL;
-
- ppc_md.hpte_remove(hpte_group);
- goto repeat;
- }
- }
+ slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0,
+ mmu_psize, ssize);
/*
* Hypervisor failure. Restore old pte and return -1
@@ -129,7 +114,7 @@ repeat:
if (unlikely(slot == -2)) {
*ptep = __pte(old_pte);
hash_failure_debug(ea, access, vsid, trap, ssize,
- mmu_psize, old_pte);
+ mmu_psize, mmu_psize, old_pte);
return -1;
}
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 1a6de0a7d8e..7e70ae968e5 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -21,6 +21,9 @@
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/setup.h>
+#include <asm/hugetlb.h>
+
+#ifdef CONFIG_HUGETLB_PAGE
#define PAGE_SHIFT_64K 16
#define PAGE_SHIFT_16M 24
@@ -48,66 +51,61 @@ static u64 gpage_freearray[MAX_NUMBER_GPAGES];
static unsigned nr_gpages;
#endif
-static inline int shift_to_mmu_psize(unsigned int shift)
-{
- int psize;
+#define hugepd_none(hpd) ((hpd).pd == 0)
- for (psize = 0; psize < MMU_PAGE_COUNT; ++psize)
- if (mmu_psize_defs[psize].shift == shift)
- return psize;
- return -1;
-}
+#ifdef CONFIG_PPC_BOOK3S_64
+/*
+ * At this point we do the placement change only for BOOK3S 64. This would
+ * possibly work on other subarchs.
+ */
-static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
+/*
+ * We have PGD_INDEX_SIZ = 12 and PTE_INDEX_SIZE = 8, so that we can have
+ * 16GB hugepage pte in PGD and 16MB hugepage pte at PMD;
+ */
+int pmd_huge(pmd_t pmd)
{
- if (mmu_psize_defs[mmu_psize].shift)
- return mmu_psize_defs[mmu_psize].shift;
- BUG();
+ /*
+ * leaf pte for huge page, bottom two bits != 00
+ */
+ return ((pmd_val(pmd) & 0x3) != 0x0);
}
-#define hugepd_none(hpd) ((hpd).pd == 0)
-
-pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
+int pud_huge(pud_t pud)
{
- pgd_t *pg;
- pud_t *pu;
- pmd_t *pm;
- hugepd_t *hpdp = NULL;
- unsigned pdshift = PGDIR_SHIFT;
-
- if (shift)
- *shift = 0;
+ /*
+ * leaf pte for huge page, bottom two bits != 00
+ */
+ return ((pud_val(pud) & 0x3) != 0x0);
+}
- pg = pgdir + pgd_index(ea);
- if (is_hugepd(pg)) {
- hpdp = (hugepd_t *)pg;
- } else if (!pgd_none(*pg)) {
- pdshift = PUD_SHIFT;
- pu = pud_offset(pg, ea);
- if (is_hugepd(pu))
- hpdp = (hugepd_t *)pu;
- else if (!pud_none(*pu)) {
- pdshift = PMD_SHIFT;
- pm = pmd_offset(pu, ea);
- if (is_hugepd(pm))
- hpdp = (hugepd_t *)pm;
- else if (!pmd_none(*pm)) {
- return pte_offset_kernel(pm, ea);
- }
- }
- }
+int pgd_huge(pgd_t pgd)
+{
+ /*
+ * leaf pte for huge page, bottom two bits != 00
+ */
+ return ((pgd_val(pgd) & 0x3) != 0x0);
+}
+#else
+int pmd_huge(pmd_t pmd)
+{
+ return 0;
+}
- if (!hpdp)
- return NULL;
+int pud_huge(pud_t pud)
+{
+ return 0;
+}
- if (shift)
- *shift = hugepd_shift(*hpdp);
- return hugepte_offset(hpdp, ea, pdshift);
+int pgd_huge(pgd_t pgd)
+{
+ return 0;
}
-EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte);
+#endif
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
+ /* Only called for hugetlbfs pages, hence can ignore THP */
return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
}
@@ -145,6 +143,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
if (unlikely(!hugepd_none(*hpdp)))
break;
else
+ /* We use the old format for PPC_FSL_BOOK3E */
hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
}
/* If we bailed from the for loop early, an error occurred, clean up */
@@ -156,9 +155,15 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
#else
if (!hugepd_none(*hpdp))
kmem_cache_free(cachep, new);
- else
+ else {
+#ifdef CONFIG_PPC_BOOK3S_64
+ hpdp->pd = (unsigned long)new |
+ (shift_to_mmu_psize(pshift) << 2);
+#else
hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
#endif
+ }
+#endif
spin_unlock(&mm->page_table_lock);
return 0;
}
@@ -175,6 +180,61 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
#define HUGEPD_PUD_SHIFT PMD_SHIFT
#endif
+#ifdef CONFIG_PPC_BOOK3S_64
+/*
+ * At this point we do the placement change only for BOOK3S 64. This would
+ * possibly work on other subarchs.
+ */
+pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
+{
+ pgd_t *pg;
+ pud_t *pu;
+ pmd_t *pm;
+ hugepd_t *hpdp = NULL;
+ unsigned pshift = __ffs(sz);
+ unsigned pdshift = PGDIR_SHIFT;
+
+ addr &= ~(sz-1);
+ pg = pgd_offset(mm, addr);
+
+ if (pshift == PGDIR_SHIFT)
+ /* 16GB huge page */
+ return (pte_t *) pg;
+ else if (pshift > PUD_SHIFT)
+ /*
+ * We need to use hugepd table
+ */
+ hpdp = (hugepd_t *)pg;
+ else {
+ pdshift = PUD_SHIFT;
+ pu = pud_alloc(mm, pg, addr);
+ if (pshift == PUD_SHIFT)
+ return (pte_t *)pu;
+ else if (pshift > PMD_SHIFT)
+ hpdp = (hugepd_t *)pu;
+ else {
+ pdshift = PMD_SHIFT;
+ pm = pmd_alloc(mm, pu, addr);
+ if (pshift == PMD_SHIFT)
+ /* 16MB hugepage */
+ return (pte_t *)pm;
+ else
+ hpdp = (hugepd_t *)pm;
+ }
+ }
+ if (!hpdp)
+ return NULL;
+
+ BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
+
+ if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
+ return NULL;
+
+ return hugepte_offset(hpdp, addr, pdshift);
+}
+
+#else
+
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
pgd_t *pg;
@@ -212,6 +272,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
return hugepte_offset(hpdp, addr, pdshift);
}
+#endif
#ifdef CONFIG_PPC_FSL_BOOK3E
/* Build list of addresses of gigantic pages. This function is used in early
@@ -240,7 +301,7 @@ void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
int alloc_bootmem_huge_page(struct hstate *hstate)
{
struct huge_bootmem_page *m;
- int idx = shift_to_mmu_psize(hstate->order + PAGE_SHIFT);
+ int idx = shift_to_mmu_psize(huge_page_shift(hstate));
int nr_gpages = gpage_freearray[idx].nr_gpages;
if (nr_gpages == 0)
@@ -401,12 +462,13 @@ static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
{
struct hugepd_freelist **batchp;
- batchp = &__get_cpu_var(hugepd_freelist_cur);
+ batchp = &get_cpu_var(hugepd_freelist_cur);
if (atomic_read(&tlb->mm->mm_users) < 2 ||
cpumask_equal(mm_cpumask(tlb->mm),
cpumask_of(smp_processor_id()))) {
kmem_cache_free(hugepte_cache, hugepte);
+ put_cpu_var(hugepd_freelist_cur);
return;
}
@@ -420,6 +482,7 @@ static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
*batchp = NULL;
}
+ put_cpu_var(hugepd_freelist_cur);
}
#endif
@@ -475,8 +538,14 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
do {
pmd = pmd_offset(pud, addr);
next = pmd_addr_end(addr, end);
- if (pmd_none(*pmd))
+ if (!is_hugepd(pmd)) {
+ /*
+ * if it is not hugepd pointer, we should already find
+ * it cleared.
+ */
+ WARN_ON(!pmd_none_or_clear_bad(pmd));
continue;
+ }
#ifdef CONFIG_PPC_FSL_BOOK3E
/*
* Increment next by the size of the huge mapping since
@@ -556,8 +625,6 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
/*
* This function frees user-level page tables of a process.
- *
- * Must be called with pagetable lock held.
*/
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
@@ -613,11 +680,14 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
struct page *page;
unsigned shift;
unsigned long mask;
-
+ /*
+ * Transparent hugepages are handled by generic code. We can skip them
+ * here.
+ */
ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
/* Verify it is a huge page else bail. */
- if (!ptep || !shift)
+ if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep))
return ERR_PTR(-EINVAL);
mask = (1UL << shift) - 1;
@@ -628,16 +698,6 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
return page;
}
-int pmd_huge(pmd_t pmd)
-{
- return 0;
-}
-
-int pud_huge(pud_t pud)
-{
- return 0;
-}
-
struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
pmd_t *pmd, int write)
@@ -646,69 +706,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
return NULL;
}
-static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
-{
- unsigned long mask;
- unsigned long pte_end;
- struct page *head, *page, *tail;
- pte_t pte;
- int refs;
-
- pte_end = (addr + sz) & ~(sz-1);
- if (pte_end < end)
- end = pte_end;
-
- pte = *ptep;
- mask = _PAGE_PRESENT | _PAGE_USER;
- if (write)
- mask |= _PAGE_RW;
-
- if ((pte_val(pte) & mask) != mask)
- return 0;
-
- /* hugepages are never "special" */
- VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
-
- refs = 0;
- head = pte_page(pte);
-
- page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
- tail = page;
- do {
- VM_BUG_ON(compound_head(page) != head);
- pages[*nr] = page;
- (*nr)++;
- page++;
- refs++;
- } while (addr += PAGE_SIZE, addr != end);
-
- if (!page_cache_add_speculative(head, refs)) {
- *nr -= refs;
- return 0;
- }
-
- if (unlikely(pte_val(pte) != pte_val(*ptep))) {
- /* Could be optimized better */
- *nr -= refs;
- while (refs--)
- put_page(head);
- return 0;
- }
-
- /*
- * Any tail page need their mapcount reference taken before we
- * return.
- */
- while (refs--) {
- if (PageTail(tail))
- get_huge_page_tail(tail);
- tail++;
- }
-
- return 1;
-}
-
static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
unsigned long sz)
{
@@ -742,7 +739,7 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
struct hstate *hstate = hstate_file(file);
int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
- return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
+ return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1);
}
#endif
@@ -883,11 +880,16 @@ static int __init hugetlbpage_init(void)
pdshift = PUD_SHIFT;
else
pdshift = PGDIR_SHIFT;
-
- pgtable_cache_add(pdshift - shift, NULL);
- if (!PGT_CACHE(pdshift - shift))
- panic("hugetlbpage_init(): could not create "
- "pgtable cache for %d bit pagesize\n", shift);
+ /*
+ * if we have pdshift and shift value same, we don't
+ * use pgt cache for hugepd.
+ */
+ if (pdshift != shift) {
+ pgtable_cache_add(pdshift - shift, NULL);
+ if (!PGT_CACHE(pdshift - shift))
+ panic("hugetlbpage_init(): could not create "
+ "pgtable cache for %d bit pagesize\n", shift);
+ }
}
/* Set default large page size. Currently, we pick 16M or 1M
@@ -920,3 +922,168 @@ void flush_dcache_icache_hugepage(struct page *page)
}
}
}
+
+#endif /* CONFIG_HUGETLB_PAGE */
+
+/*
+ * We have 4 cases for pgds and pmds:
+ * (1) invalid (all zeroes)
+ * (2) pointer to next table, as normal; bottom 6 bits == 0
+ * (3) leaf pte for huge page, bottom two bits != 00
+ * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table
+ *
+ * So long as we atomically load page table pointers we are safe against teardown,
+ * we can follow the address down to the the page and take a ref on it.
+ */
+
+pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
+{
+ pgd_t pgd, *pgdp;
+ pud_t pud, *pudp;
+ pmd_t pmd, *pmdp;
+ pte_t *ret_pte;
+ hugepd_t *hpdp = NULL;
+ unsigned pdshift = PGDIR_SHIFT;
+
+ if (shift)
+ *shift = 0;
+
+ pgdp = pgdir + pgd_index(ea);
+ pgd = ACCESS_ONCE(*pgdp);
+ /*
+ * Always operate on the local stack value. This make sure the
+ * value don't get updated by a parallel THP split/collapse,
+ * page fault or a page unmap. The return pte_t * is still not
+ * stable. So should be checked there for above conditions.
+ */
+ if (pgd_none(pgd))
+ return NULL;
+ else if (pgd_huge(pgd)) {
+ ret_pte = (pte_t *) pgdp;
+ goto out;
+ } else if (is_hugepd(&pgd))
+ hpdp = (hugepd_t *)&pgd;
+ else {
+ /*
+ * Even if we end up with an unmap, the pgtable will not
+ * be freed, because we do an rcu free and here we are
+ * irq disabled
+ */
+ pdshift = PUD_SHIFT;
+ pudp = pud_offset(&pgd, ea);
+ pud = ACCESS_ONCE(*pudp);
+
+ if (pud_none(pud))
+ return NULL;
+ else if (pud_huge(pud)) {
+ ret_pte = (pte_t *) pudp;
+ goto out;
+ } else if (is_hugepd(&pud))
+ hpdp = (hugepd_t *)&pud;
+ else {
+ pdshift = PMD_SHIFT;
+ pmdp = pmd_offset(&pud, ea);
+ pmd = ACCESS_ONCE(*pmdp);
+ /*
+ * A hugepage collapse is captured by pmd_none, because
+ * it mark the pmd none and do a hpte invalidate.
+ *
+ * A hugepage split is captured by pmd_trans_splitting
+ * because we mark the pmd trans splitting and do a
+ * hpte invalidate
+ *
+ */
+ if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+ return NULL;
+
+ if (pmd_huge(pmd) || pmd_large(pmd)) {
+ ret_pte = (pte_t *) pmdp;
+ goto out;
+ } else if (is_hugepd(&pmd))
+ hpdp = (hugepd_t *)&pmd;
+ else
+ return pte_offset_kernel(&pmd, ea);
+ }
+ }
+ if (!hpdp)
+ return NULL;
+
+ ret_pte = hugepte_offset(hpdp, ea, pdshift);
+ pdshift = hugepd_shift(*hpdp);
+out:
+ if (shift)
+ *shift = pdshift;
+ return ret_pte;
+}
+EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte);
+
+int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
+ unsigned long end, int write, struct page **pages, int *nr)
+{
+ unsigned long mask;
+ unsigned long pte_end;
+ struct page *head, *page, *tail;
+ pte_t pte;
+ int refs;
+
+ pte_end = (addr + sz) & ~(sz-1);
+ if (pte_end < end)
+ end = pte_end;
+
+ pte = ACCESS_ONCE(*ptep);
+ mask = _PAGE_PRESENT | _PAGE_USER;
+ if (write)
+ mask |= _PAGE_RW;
+
+ if ((pte_val(pte) & mask) != mask)
+ return 0;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ /*
+ * check for splitting here
+ */
+ if (pmd_trans_splitting(pte_pmd(pte)))
+ return 0;
+#endif
+
+ /* hugepages are never "special" */
+ VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+ refs = 0;
+ head = pte_page(pte);
+
+ page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
+ tail = page;
+ do {
+ VM_BUG_ON(compound_head(page) != head);
+ pages[*nr] = page;
+ (*nr)++;
+ page++;
+ refs++;
+ } while (addr += PAGE_SIZE, addr != end);
+
+ if (!page_cache_add_speculative(head, refs)) {
+ *nr -= refs;
+ return 0;
+ }
+
+ if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+ /* Could be optimized better */
+ *nr -= refs;
+ while (refs--)
+ put_page(head);
+ return 0;
+ }
+
+ /*
+ * Any tail page need their mapcount reference taken before we
+ * return.
+ */
+ while (refs--) {
+ if (PageTail(tail))
+ get_huge_page_tail(tail);
+ tail++;
+ }
+
+ return 1;
+}
diff --git a/arch/powerpc/mm/icswx.c b/arch/powerpc/mm/icswx.c
index 8cdbd8634a5..915412e4d5b 100644
--- a/arch/powerpc/mm/icswx.c
+++ b/arch/powerpc/mm/icswx.c
@@ -67,7 +67,7 @@
void switch_cop(struct mm_struct *next)
{
-#ifdef CONFIG_ICSWX_PID
+#ifdef CONFIG_PPC_ICSWX_PID
mtspr(SPRN_PID, next->context.cop_pid);
#endif
mtspr(SPRN_ACOP, next->context.acop);
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index 01e2db97a21..cff59f1bec2 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -52,7 +52,7 @@
#if defined(CONFIG_KERNEL_START_BOOL) || defined(CONFIG_LOWMEM_SIZE_BOOL)
/* The amount of lowmem must be within 0xF0000000 - KERNELBASE. */
#if (CONFIG_LOWMEM_SIZE > (0xF0000000 - PAGE_OFFSET))
-#error "You must adjust CONFIG_LOWMEM_SIZE or CONFIG_START_KERNEL"
+#error "You must adjust CONFIG_LOWMEM_SIZE or CONFIG_KERNEL_START"
#endif
#endif
#define MAX_LOW_MEM CONFIG_LOWMEM_SIZE
@@ -213,7 +213,12 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base,
*/
BUG_ON(first_memblock_base != 0);
+#ifdef CONFIG_PIN_TLB
+ /* 8xx can only access 24MB at the moment */
+ memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01800000));
+#else
/* 8xx can only access 8MB at the moment */
memblock_set_current_limit(min_t(u64, first_memblock_size, 0x00800000));
+#endif
}
#endif /* CONFIG_8xx */
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 95a45293e5a..e3734edffa6 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -88,7 +88,11 @@ static void pgd_ctor(void *addr)
static void pmd_ctor(void *addr)
{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ memset(addr, 0, PMD_TABLE_SIZE * 2);
+#else
memset(addr, 0, PMD_TABLE_SIZE);
+#endif
}
struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
@@ -129,8 +133,7 @@ void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
align = max_t(unsigned long, align, minalign);
name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift);
new = kmem_cache_create(name, table_size, align, 0, ctor);
- PGT_CACHE(shift) = new;
-
+ pgtable_cache[shift - 1] = new;
pr_debug("Allocated pgtable cache for order %d\n", shift);
}
@@ -138,10 +141,9 @@ void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
void pgtable_cache_init(void)
{
pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
- pgtable_cache_add(PMD_INDEX_SIZE, pmd_ctor);
- if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_INDEX_SIZE))
+ pgtable_cache_add(PMD_CACHE_INDEX, pmd_ctor);
+ if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_CACHE_INDEX))
panic("Couldn't allocate pgtable caches");
-
/* In all current configs, when the PUD index exists it's the
* same size as either the pgd or pmd index. Verify that the
* initialization above has also created a PUD cache. This
@@ -216,7 +218,8 @@ static void __meminit vmemmap_create_mapping(unsigned long start,
unsigned long phys)
{
int mapped = htab_bolt_mapping(start, start + page_size, phys,
- PAGE_KERNEL, mmu_vmemmap_psize,
+ pgprot_val(PAGE_KERNEL),
+ mmu_vmemmap_psize,
mmu_kernel_ssize);
BUG_ON(mapped < 0);
}
@@ -263,19 +266,14 @@ static __meminit void vmemmap_list_populate(unsigned long phys,
vmemmap_list = vmem_back;
}
-int __meminit vmemmap_populate(struct page *start_page,
- unsigned long nr_pages, int node)
+int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
{
- unsigned long start = (unsigned long)start_page;
- unsigned long end = (unsigned long)(start_page + nr_pages);
unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
/* Align to the page size of the linear mapping. */
start = _ALIGN_DOWN(start, page_size);
- pr_debug("vmemmap_populate page %p, %ld pages, node %d\n",
- start_page, nr_pages, node);
- pr_debug(" -> map %lx..%lx\n", start, end);
+ pr_debug("vmemmap_populate %lx..%lx, node %d\n", start, end, node);
for (; start < end; start += page_size) {
void *p;
@@ -297,5 +295,63 @@ int __meminit vmemmap_populate(struct page *start_page,
return 0;
}
-#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+void vmemmap_free(unsigned long start, unsigned long end)
+{
+}
+
+void register_page_bootmem_memmap(unsigned long section_nr,
+ struct page *start_page, unsigned long size)
+{
+}
+
+/*
+ * We do not have access to the sparsemem vmemmap, so we fallback to
+ * walking the list of sparsemem blocks which we already maintain for
+ * the sake of crashdump. In the long run, we might want to maintain
+ * a tree if performance of that linear walk becomes a problem.
+ *
+ * realmode_pfn_to_page functions can fail due to:
+ * 1) As real sparsemem blocks do not lay in RAM continously (they
+ * are in virtual address space which is not available in the real mode),
+ * the requested page struct can be split between blocks so get_page/put_page
+ * may fail.
+ * 2) When huge pages are used, the get_page/put_page API will fail
+ * in real mode as the linked addresses in the page struct are virtual
+ * too.
+ */
+struct page *realmode_pfn_to_page(unsigned long pfn)
+{
+ struct vmemmap_backing *vmem_back;
+ struct page *page;
+ unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
+ unsigned long pg_va = (unsigned long) pfn_to_page(pfn);
+
+ for (vmem_back = vmemmap_list; vmem_back; vmem_back = vmem_back->list) {
+ if (pg_va < vmem_back->virt_addr)
+ continue;
+
+ /* Check that page struct is not split between real pages */
+ if ((pg_va + sizeof(struct page)) >
+ (vmem_back->virt_addr + page_size))
+ return NULL;
+
+ page = (struct page *) (vmem_back->phys + pg_va -
+ vmem_back->virt_addr);
+ return page;
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
+
+#elif defined(CONFIG_FLATMEM)
+
+struct page *realmode_pfn_to_page(unsigned long pfn)
+{
+ struct page *page = pfn_to_page(pfn);
+ return page;
+}
+EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
+
+#endif /* CONFIG_SPARSEMEM_VMEMMAP/CONFIG_FLATMEM */
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 0dba5066c22..2c8e90f5789 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -66,10 +66,9 @@ unsigned long long memory_limit;
#ifdef CONFIG_HIGHMEM
pte_t *kmap_pte;
+EXPORT_SYMBOL(kmap_pte);
pgprot_t kmap_prot;
-
EXPORT_SYMBOL(kmap_prot);
-EXPORT_SYMBOL(kmap_pte);
static inline pte_t *virt_to_kpte(unsigned long vaddr)
{
@@ -133,6 +132,23 @@ int arch_add_memory(int nid, u64 start, u64 size)
return __add_pages(nid, zone, start_pfn, nr_pages);
}
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+ struct zone *zone;
+ int ret;
+
+ zone = page_zone(pfn_to_page(start_pfn));
+ ret = __remove_pages(zone, start_pfn, nr_pages);
+ if (!ret && (ppc_md.remove_memory))
+ ret = ppc_md.remove_memory(start, size);
+
+ return ret;
+}
+#endif
#endif /* CONFIG_MEMORY_HOTPLUG */
/*
@@ -195,13 +211,10 @@ void __init do_init_bootmem(void)
min_low_pfn = MEMORY_START >> PAGE_SHIFT;
boot_mapsize = init_bootmem_node(NODE_DATA(0), start >> PAGE_SHIFT, min_low_pfn, max_low_pfn);
- /* Add active regions with valid PFNs */
- for_each_memblock(memory, reg) {
- unsigned long start_pfn, end_pfn;
- start_pfn = memblock_region_memory_base_pfn(reg);
- end_pfn = memblock_region_memory_end_pfn(reg);
- memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
- }
+ /* Place all memblock_regions in the same node and merge contiguous
+ * memblock_regions
+ */
+ memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
/* Add all physical memory to the bootmem map, mark each area
* present.
@@ -289,49 +302,30 @@ void __init paging_init(void)
}
#endif /* ! CONFIG_NEED_MULTIPLE_NODES */
+static void __init register_page_bootmem_info(void)
+{
+ int i;
+
+ for_each_online_node(i)
+ register_page_bootmem_info_node(NODE_DATA(i));
+}
+
void __init mem_init(void)
{
-#ifdef CONFIG_NEED_MULTIPLE_NODES
- int nid;
-#endif
- pg_data_t *pgdat;
- unsigned long i;
- struct page *page;
- unsigned long reservedpages = 0, codesize, initsize, datasize, bsssize;
+ /*
+ * book3s is limited to 16 page sizes due to encoding this in
+ * a 4-bit field for slices.
+ */
+ BUILD_BUG_ON(MMU_PAGE_COUNT > 16);
#ifdef CONFIG_SWIOTLB
swiotlb_init(0);
#endif
- num_physpages = memblock_phys_mem_size() >> PAGE_SHIFT;
+ register_page_bootmem_info();
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
-
-#ifdef CONFIG_NEED_MULTIPLE_NODES
- for_each_online_node(nid) {
- if (NODE_DATA(nid)->node_spanned_pages != 0) {
- printk("freeing bootmem node %d\n", nid);
- totalram_pages +=
- free_all_bootmem_node(NODE_DATA(nid));
- }
- }
-#else
- max_mapnr = max_pfn;
- totalram_pages += free_all_bootmem();
-#endif
- for_each_online_pgdat(pgdat) {
- for (i = 0; i < pgdat->node_spanned_pages; i++) {
- if (!pfn_valid(pgdat->node_start_pfn + i))
- continue;
- page = pgdat_page_nr(pgdat, i);
- if (PageReserved(page))
- reservedpages++;
- }
- }
-
- codesize = (unsigned long)&_sdata - (unsigned long)&_stext;
- datasize = (unsigned long)&_edata - (unsigned long)&_sdata;
- initsize = (unsigned long)&__init_end - (unsigned long)&__init_begin;
- bsssize = (unsigned long)&__bss_stop - (unsigned long)&__bss_start;
+ set_max_mapnr(max_pfn);
+ free_all_bootmem();
#ifdef CONFIG_HIGHMEM
{
@@ -341,17 +335,9 @@ void __init mem_init(void)
for (pfn = highmem_mapnr; pfn < max_mapnr; ++pfn) {
phys_addr_t paddr = (phys_addr_t)pfn << PAGE_SHIFT;
struct page *page = pfn_to_page(pfn);
- if (memblock_is_reserved(paddr))
- continue;
- ClearPageReserved(page);
- init_page_count(page);
- __free_page(page);
- totalhigh_pages++;
- reservedpages--;
+ if (!memblock_is_reserved(paddr))
+ free_highmem_page(page);
}
- totalram_pages += totalhigh_pages;
- printk(KERN_DEBUG "High memory: %luk\n",
- totalhigh_pages << (PAGE_SHIFT-10));
}
#endif /* CONFIG_HIGHMEM */
@@ -364,16 +350,7 @@ void __init mem_init(void)
(mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) - 1;
#endif
- printk(KERN_INFO "Memory: %luk/%luk available (%luk kernel code, "
- "%luk reserved, %luk data, %luk bss, %luk init)\n",
- nr_free_pages() << (PAGE_SHIFT-10),
- num_physpages << (PAGE_SHIFT-10),
- codesize >> 10,
- reservedpages << (PAGE_SHIFT-10),
- datasize >> 10,
- bsssize >> 10,
- initsize >> 10);
-
+ mem_init_print_info(NULL);
#ifdef CONFIG_PPC32
pr_info("Kernel virtual memory layout:\n");
pr_info(" * 0x%08lx..0x%08lx : fixmap\n", FIXADDR_START, FIXADDR_TOP);
@@ -396,39 +373,14 @@ void __init mem_init(void)
void free_initmem(void)
{
- unsigned long addr;
-
ppc_md.progress = ppc_printk_progress;
-
- addr = (unsigned long)__init_begin;
- for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) {
- memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
- ClearPageReserved(virt_to_page(addr));
- init_page_count(virt_to_page(addr));
- free_page(addr);
- totalram_pages++;
- }
- pr_info("Freeing unused kernel memory: %luk freed\n",
- ((unsigned long)__init_end -
- (unsigned long)__init_begin) >> 10);
+ free_initmem_default(POISON_FREE_INITMEM);
}
#ifdef CONFIG_BLK_DEV_INITRD
void __init free_initrd_mem(unsigned long start, unsigned long end)
{
- if (start >= end)
- return;
-
- start = _ALIGN_DOWN(start, PAGE_SIZE);
- end = _ALIGN_UP(end, PAGE_SIZE);
- pr_info("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
-
- for (; start < end; start += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(start));
- init_page_count(virt_to_page(start));
- free_page(start);
- totalram_pages++;
- }
+ free_reserved_area((void *)start, (void *)end, -1, "initrd");
}
#endif
@@ -529,6 +481,10 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
pte_t *ptep)
{
#ifdef CONFIG_PPC_STD_MMU
+ /*
+ * We don't need to worry about _PAGE_PRESENT here because we are
+ * called with either mm->page_table_lock held or ptl lock held
+ */
unsigned long access = 0, trap;
/* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
@@ -562,7 +518,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
* System memory should not be in /proc/iomem but various tools expect it
* (eg kdump).
*/
-static int add_system_ram_resources(void)
+static int __init add_system_ram_resources(void)
{
struct memblock_region *reg;
@@ -578,7 +534,7 @@ static int add_system_ram_resources(void)
res->name = "System RAM";
res->start = base;
res->end = base + size - 1;
- res->flags = IORESOURCE_MEM;
+ res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
WARN_ON(request_resource(&iomem_resource, res) < 0);
}
}
diff --git a/arch/powerpc/mm/mmap_64.c b/arch/powerpc/mm/mmap.c
index 67a42ed0d2f..cb8bdbe4972 100644
--- a/arch/powerpc/mm/mmap_64.c
+++ b/arch/powerpc/mm/mmap.c
@@ -92,10 +92,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
if (mmap_is_legacy()) {
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = arch_get_unmapped_area;
- mm->unmap_area = arch_unmap_area;
} else {
mm->mmap_base = mmap_base();
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
- mm->unmap_area = arch_unmap_area_topdown;
}
}
diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c
index 40bc5b0ace5..178876aef40 100644
--- a/arch/powerpc/mm/mmu_context_hash64.c
+++ b/arch/powerpc/mm/mmu_context_hash64.c
@@ -23,21 +23,13 @@
#include <linux/slab.h>
#include <asm/mmu_context.h>
+#include <asm/pgalloc.h>
#include "icswx.h"
static DEFINE_SPINLOCK(mmu_context_lock);
static DEFINE_IDA(mmu_context_ida);
-/*
- * 256MB segment
- * The proto-VSID space has 2^(CONTEX_BITS + USER_ESID_BITS) - 1 segments
- * available for user mappings. Each segment contains 2^28 bytes. Each
- * context maps 2^46 bytes (64TB) so we can support 2^19-1 contexts
- * (19 == 37 + 28 - 46).
- */
-#define MAX_CONTEXT ((1UL << CONTEXT_BITS) - 1)
-
int __init_new_context(void)
{
int index;
@@ -56,7 +48,7 @@ again:
else if (err)
return err;
- if (index > MAX_CONTEXT) {
+ if (index > MAX_USER_CONTEXT) {
spin_lock(&mmu_context_lock);
ida_remove(&mmu_context_ida, index);
spin_unlock(&mmu_context_lock);
@@ -94,6 +86,9 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
spin_lock_init(mm->context.cop_lockp);
#endif /* CONFIG_PPC_ICSWX */
+#ifdef CONFIG_PPC_64K_PAGES
+ mm->context.pte_frag = NULL;
+#endif
return 0;
}
@@ -105,13 +100,46 @@ void __destroy_context(int context_id)
}
EXPORT_SYMBOL_GPL(__destroy_context);
+#ifdef CONFIG_PPC_64K_PAGES
+static void destroy_pagetable_page(struct mm_struct *mm)
+{
+ int count;
+ void *pte_frag;
+ struct page *page;
+
+ pte_frag = mm->context.pte_frag;
+ if (!pte_frag)
+ return;
+
+ page = virt_to_page(pte_frag);
+ /* drop all the pending references */
+ count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT;
+ /* We allow PTE_FRAG_NR fragments from a PTE page */
+ count = atomic_sub_return(PTE_FRAG_NR - count, &page->_count);
+ if (!count) {
+ pgtable_page_dtor(page);
+ free_hot_cold_page(page, 0);
+ }
+}
+
+#else
+static inline void destroy_pagetable_page(struct mm_struct *mm)
+{
+ return;
+}
+#endif
+
+
void destroy_context(struct mm_struct *mm)
{
+
#ifdef CONFIG_PPC_ICSWX
drop_cop(mm->context.acop, mm);
kfree(mm->context.cop_lockp);
mm->context.cop_lockp = NULL;
#endif /* CONFIG_PPC_ICSWX */
+
+ destroy_pagetable_page(mm);
__destroy_context(mm->context.id);
subpage_prot_free(mm);
mm->context.id = MMU_NO_CONTEXT;
diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
index e779642c25e..928ebe79668 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -112,8 +112,10 @@ static unsigned int steal_context_smp(unsigned int id)
*/
for_each_cpu(cpu, mm_cpumask(mm)) {
for (i = cpu_first_thread_sibling(cpu);
- i <= cpu_last_thread_sibling(cpu); i++)
- __set_bit(id, stale_map[i]);
+ i <= cpu_last_thread_sibling(cpu); i++) {
+ if (stale_map[i])
+ __set_bit(id, stale_map[i]);
+ }
cpu = i - 1;
}
return id;
@@ -272,7 +274,8 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
/* XXX This clear should ultimately be part of local_flush_tlb_mm */
for (i = cpu_first_thread_sibling(cpu);
i <= cpu_last_thread_sibling(cpu); i++) {
- __clear_bit(id, stale_map[i]);
+ if (stale_map[i])
+ __clear_bit(id, stale_map[i]);
}
}
@@ -329,8 +332,8 @@ void destroy_context(struct mm_struct *mm)
#ifdef CONFIG_SMP
-static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
+static int mmu_context_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned int)(long)hcpu;
@@ -363,7 +366,7 @@ static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block __cpuinitdata mmu_context_cpu_nb = {
+static struct notifier_block mmu_context_cpu_nb = {
.notifier_call = mmu_context_cpu_notify,
};
@@ -407,17 +410,7 @@ void __init mmu_context_init(void)
} else if (mmu_has_feature(MMU_FTR_TYPE_47x)) {
first_context = 1;
last_context = 65535;
- } else
-#ifdef CONFIG_PPC_BOOK3E_MMU
- if (mmu_has_feature(MMU_FTR_TYPE_3E)) {
- u32 mmucfg = mfspr(SPRN_MMUCFG);
- u32 pid_bits = (mmucfg & MMUCFG_PIDSIZE_MASK)
- >> MMUCFG_PIDSIZE_SHIFT;
- first_context = 1;
- last_context = (1UL << (pid_bits + 1)) - 1;
- } else
-#endif
- {
+ } else {
first_context = 1;
last_context = 255;
}
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index 83eb5d5f53d..9615d82919b 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -148,6 +148,8 @@ extern unsigned long calc_cam_sz(unsigned long ram, unsigned long virt,
extern void MMU_init_hw(void);
extern unsigned long mmu_mapin_ram(unsigned long top);
extern void adjust_total_lowmem(void);
+extern int switch_to_as1(void);
+extern void restore_to_as0(int esel, int offset, void *dt_ptr, int bootcpu);
#endif
extern void loadcam_entry(unsigned int index);
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index bba87ca2b4d..3b181b22cd4 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -22,13 +22,22 @@
#include <linux/pfn.h>
#include <linux/cpuset.h>
#include <linux/node.h>
+#include <linux/stop_machine.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <asm/cputhreads.h>
#include <asm/sparsemem.h>
#include <asm/prom.h>
#include <asm/smp.h>
+#include <asm/cputhreads.h>
+#include <asm/topology.h>
#include <asm/firmware.h>
#include <asm/paca.h>
#include <asm/hvcall.h>
#include <asm/setup.h>
+#include <asm/vdso.h>
static int numa_enabled = 1;
@@ -51,7 +60,7 @@ static int form1_affinity;
#define MAX_DISTANCE_REF_POINTS 4
static int distance_ref_points_depth;
-static const unsigned int *distance_ref_points;
+static const __be32 *distance_ref_points;
static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
/*
@@ -62,14 +71,11 @@ static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
*/
static void __init setup_node_to_cpumask_map(void)
{
- unsigned int node, num = 0;
+ unsigned int node;
/* setup nr_node_ids if not done yet */
- if (nr_node_ids == MAX_NUMNODES) {
- for_each_node_mask(node, node_possible_map)
- num = node;
- nr_node_ids = num + 1;
- }
+ if (nr_node_ids == MAX_NUMNODES)
+ setup_nr_node_ids();
/* allocate the map */
for (node = 0; node < nr_node_ids; node++)
@@ -79,7 +85,7 @@ static void __init setup_node_to_cpumask_map(void)
dbg("Node to cpumask map for %d nodes\n", nr_node_ids);
}
-static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn,
+static int __init fake_numa_create_new_node(unsigned long end_pfn,
unsigned int *nid)
{
unsigned long long mem;
@@ -148,9 +154,22 @@ static void __init get_node_active_region(unsigned long pfn,
}
}
-static void map_cpu_to_node(int cpu, int node)
+static void reset_numa_cpu_lookup_table(void)
+{
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu)
+ numa_cpu_lookup_table[cpu] = -1;
+}
+
+static void update_numa_cpu_lookup_table(unsigned int cpu, int node)
{
numa_cpu_lookup_table[cpu] = node;
+}
+
+static void map_cpu_to_node(int cpu, int node)
+{
+ update_numa_cpu_lookup_table(cpu, node);
dbg("adding cpu %d to node %d\n", cpu, node);
@@ -175,7 +194,7 @@ static void unmap_cpu_from_node(unsigned long cpu)
#endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
/* must hold reference to node during call */
-static const int *of_get_associativity(struct device_node *dev)
+static const __be32 *of_get_associativity(struct device_node *dev)
{
return of_get_property(dev, "ibm,associativity", NULL);
}
@@ -185,13 +204,13 @@ static const int *of_get_associativity(struct device_node *dev)
* it exists (the property exists only in kexec/kdump kernels,
* added by kexec-tools)
*/
-static const u32 *of_get_usable_memory(struct device_node *memory)
+static const __be32 *of_get_usable_memory(struct device_node *memory)
{
- const u32 *prop;
+ const __be32 *prop;
u32 len;
prop = of_get_property(memory, "linux,drconf-usable-memory", &len);
if (!prop || len < sizeof(unsigned int))
- return 0;
+ return NULL;
return prop;
}
@@ -201,7 +220,7 @@ int __node_distance(int a, int b)
int distance = LOCAL_DISTANCE;
if (!form1_affinity)
- return distance;
+ return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
for (i = 0; i < distance_ref_points_depth; i++) {
if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
@@ -213,9 +232,10 @@ int __node_distance(int a, int b)
return distance;
}
+EXPORT_SYMBOL(__node_distance);
static void initialize_distance_lookup_table(int nid,
- const unsigned int *associativity)
+ const __be32 *associativity)
{
int i;
@@ -223,29 +243,32 @@ static void initialize_distance_lookup_table(int nid,
return;
for (i = 0; i < distance_ref_points_depth; i++) {
- distance_lookup_table[nid][i] =
- associativity[distance_ref_points[i]];
+ const __be32 *entry;
+
+ entry = &associativity[be32_to_cpu(distance_ref_points[i])];
+ distance_lookup_table[nid][i] = of_read_number(entry, 1);
}
}
/* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
* info is found.
*/
-static int associativity_to_nid(const unsigned int *associativity)
+static int associativity_to_nid(const __be32 *associativity)
{
int nid = -1;
if (min_common_depth == -1)
goto out;
- if (associativity[0] >= min_common_depth)
- nid = associativity[min_common_depth];
+ if (of_read_number(associativity, 1) >= min_common_depth)
+ nid = of_read_number(&associativity[min_common_depth], 1);
/* POWER4 LPAR uses 0xffff as invalid node */
if (nid == 0xffff || nid >= MAX_NUMNODES)
nid = -1;
- if (nid > 0 && associativity[0] >= distance_ref_points_depth)
+ if (nid > 0 &&
+ of_read_number(associativity, 1) >= distance_ref_points_depth)
initialize_distance_lookup_table(nid, associativity);
out:
@@ -258,7 +281,7 @@ out:
static int of_node_to_nid_single(struct device_node *device)
{
int nid = -1;
- const unsigned int *tmp;
+ const __be32 *tmp;
tmp = of_get_associativity(device);
if (tmp)
@@ -291,9 +314,7 @@ EXPORT_SYMBOL_GPL(of_node_to_nid);
static int __init find_min_common_depth(void)
{
int depth;
- struct device_node *chosen;
struct device_node *root;
- const char *vec5;
if (firmware_has_feature(FW_FEATURE_OPAL))
root = of_find_node_by_path("/ibm,opal");
@@ -325,28 +346,14 @@ static int __init find_min_common_depth(void)
distance_ref_points_depth /= sizeof(int);
-#define VEC5_AFFINITY_BYTE 5
-#define VEC5_AFFINITY 0x80
-
- if (firmware_has_feature(FW_FEATURE_OPAL))
+ if (firmware_has_feature(FW_FEATURE_OPAL) ||
+ firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
+ dbg("Using form 1 affinity\n");
form1_affinity = 1;
- else {
- chosen = of_find_node_by_path("/chosen");
- if (chosen) {
- vec5 = of_get_property(chosen,
- "ibm,architecture-vec-5", NULL);
- if (vec5 && (vec5[VEC5_AFFINITY_BYTE] &
- VEC5_AFFINITY)) {
- dbg("Using form 1 affinity\n");
- form1_affinity = 1;
- }
-
- of_node_put(chosen);
- }
}
if (form1_affinity) {
- depth = distance_ref_points[0];
+ depth = of_read_number(distance_ref_points, 1);
} else {
if (distance_ref_points_depth < 2) {
printk(KERN_WARNING "NUMA: "
@@ -354,7 +361,7 @@ static int __init find_min_common_depth(void)
goto err;
}
- depth = distance_ref_points[1];
+ depth = of_read_number(&distance_ref_points[1], 1);
}
/*
@@ -388,12 +395,12 @@ static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
of_node_put(memory);
}
-static unsigned long read_n_cells(int n, const unsigned int **buf)
+static unsigned long read_n_cells(int n, const __be32 **buf)
{
unsigned long result = 0;
while (n--) {
- result = (result << 32) | **buf;
+ result = (result << 32) | of_read_number(*buf, 1);
(*buf)++;
}
return result;
@@ -403,17 +410,17 @@ static unsigned long read_n_cells(int n, const unsigned int **buf)
* Read the next memblock list entry from the ibm,dynamic-memory property
* and return the information in the provided of_drconf_cell structure.
*/
-static void read_drconf_cell(struct of_drconf_cell *drmem, const u32 **cellp)
+static void read_drconf_cell(struct of_drconf_cell *drmem, const __be32 **cellp)
{
- const u32 *cp;
+ const __be32 *cp;
drmem->base_addr = read_n_cells(n_mem_addr_cells, cellp);
cp = *cellp;
- drmem->drc_index = cp[0];
- drmem->reserved = cp[1];
- drmem->aa_index = cp[2];
- drmem->flags = cp[3];
+ drmem->drc_index = of_read_number(cp, 1);
+ drmem->reserved = of_read_number(&cp[1], 1);
+ drmem->aa_index = of_read_number(&cp[2], 1);
+ drmem->flags = of_read_number(&cp[3], 1);
*cellp = cp + 4;
}
@@ -425,16 +432,16 @@ static void read_drconf_cell(struct of_drconf_cell *drmem, const u32 **cellp)
* list entries followed by N memblock list entries. Each memblock list entry
* contains information as laid out in the of_drconf_cell struct above.
*/
-static int of_get_drconf_memory(struct device_node *memory, const u32 **dm)
+static int of_get_drconf_memory(struct device_node *memory, const __be32 **dm)
{
- const u32 *prop;
+ const __be32 *prop;
u32 len, entries;
prop = of_get_property(memory, "ibm,dynamic-memory", &len);
if (!prop || len < sizeof(unsigned int))
return 0;
- entries = *prop++;
+ entries = of_read_number(prop++, 1);
/* Now that we know the number of entries, revalidate the size
* of the property read in to ensure we have everything
@@ -452,7 +459,7 @@ static int of_get_drconf_memory(struct device_node *memory, const u32 **dm)
*/
static u64 of_get_lmb_size(struct device_node *memory)
{
- const u32 *prop;
+ const __be32 *prop;
u32 len;
prop = of_get_property(memory, "ibm,lmb-size", &len);
@@ -465,7 +472,7 @@ static u64 of_get_lmb_size(struct device_node *memory)
struct assoc_arrays {
u32 n_arrays;
u32 array_sz;
- const u32 *arrays;
+ const __be32 *arrays;
};
/*
@@ -481,15 +488,15 @@ struct assoc_arrays {
static int of_get_assoc_arrays(struct device_node *memory,
struct assoc_arrays *aa)
{
- const u32 *prop;
+ const __be32 *prop;
u32 len;
prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
if (!prop || len < 2 * sizeof(unsigned int))
return -1;
- aa->n_arrays = *prop++;
- aa->array_sz = *prop++;
+ aa->n_arrays = of_read_number(prop++, 1);
+ aa->array_sz = of_read_number(prop++, 1);
/* Now that we know the number of arrays and size of each array,
* revalidate the size of the property read in.
@@ -516,7 +523,7 @@ static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
!(drmem->flags & DRCONF_MEM_AI_INVALID) &&
drmem->aa_index < aa->n_arrays) {
index = drmem->aa_index * aa->array_sz + min_common_depth - 1;
- nid = aa->arrays[index];
+ nid = of_read_number(&aa->arrays[index], 1);
if (nid == 0xffff || nid >= MAX_NUMNODES)
nid = default_nid;
@@ -529,13 +536,26 @@ static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
* Figure out to which domain a cpu belongs and stick it there.
* Return the id of the domain used.
*/
-static int __cpuinit numa_setup_cpu(unsigned long lcpu)
+static int numa_setup_cpu(unsigned long lcpu)
{
- int nid = 0;
- struct device_node *cpu = of_get_cpu_node(lcpu, NULL);
+ int nid;
+ struct device_node *cpu;
+
+ /*
+ * If a valid cpu-to-node mapping is already available, use it
+ * directly instead of querying the firmware, since it represents
+ * the most recent mapping notified to us by the platform (eg: VPHN).
+ */
+ if ((nid = numa_cpu_lookup_table[lcpu]) >= 0) {
+ map_cpu_to_node(lcpu, nid);
+ return nid;
+ }
+
+ cpu = of_get_cpu_node(lcpu, NULL);
if (!cpu) {
WARN_ON(1);
+ nid = 0;
goto out;
}
@@ -551,17 +571,38 @@ out:
return nid;
}
-static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
- unsigned long action,
+static void verify_cpu_node_mapping(int cpu, int node)
+{
+ int base, sibling, i;
+
+ /* Verify that all the threads in the core belong to the same node */
+ base = cpu_first_thread_sibling(cpu);
+
+ for (i = 0; i < threads_per_core; i++) {
+ sibling = base + i;
+
+ if (sibling == cpu || cpu_is_offline(sibling))
+ continue;
+
+ if (cpu_to_node(sibling) != node) {
+ WARN(1, "CPU thread siblings %d and %d don't belong"
+ " to the same node!\n", cpu, sibling);
+ break;
+ }
+ }
+}
+
+static int cpu_numa_callback(struct notifier_block *nfb, unsigned long action,
void *hcpu)
{
unsigned long lcpu = (unsigned long)hcpu;
- int ret = NOTIFY_DONE;
+ int ret = NOTIFY_DONE, nid;
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
- numa_setup_cpu(lcpu);
+ nid = numa_setup_cpu(lcpu);
+ verify_cpu_node_mapping((int)lcpu, nid);
ret = NOTIFY_OK;
break;
#ifdef CONFIG_HOTPLUG_CPU
@@ -608,7 +649,7 @@ static unsigned long __init numa_enforce_memory_limit(unsigned long start,
* Reads the counter for a given entry in
* linux,drconf-usable-memory property
*/
-static inline int __init read_usm_ranges(const u32 **usm)
+static inline int __init read_usm_ranges(const __be32 **usm)
{
/*
* For each lmb in ibm,dynamic-memory a corresponding
@@ -625,7 +666,7 @@ static inline int __init read_usm_ranges(const u32 **usm)
*/
static void __init parse_drconf_memory(struct device_node *memory)
{
- const u32 *uninitialized_var(dm), *usm;
+ const __be32 *uninitialized_var(dm), *usm;
unsigned int n, rc, ranges, is_kexec_kdump = 0;
unsigned long lmb_size, base, size, sz;
int nid;
@@ -680,7 +721,8 @@ static void __init parse_drconf_memory(struct device_node *memory)
node_set_online(nid);
sz = numa_enforce_memory_limit(base, size);
if (sz)
- memblock_set_node(base, sz, nid);
+ memblock_set_node(base, sz,
+ &memblock.memory, nid);
} while (--ranges);
}
}
@@ -734,7 +776,7 @@ static int __init parse_numa_properties(void)
unsigned long size;
int nid;
int ranges;
- const unsigned int *memcell_buf;
+ const __be32 *memcell_buf;
unsigned int len;
memcell_buf = of_get_property(memory,
@@ -770,7 +812,7 @@ new_range:
continue;
}
- memblock_set_node(start, size, nid);
+ memblock_set_node(start, size, &memblock.memory, nid);
if (--ranges)
goto new_range;
@@ -807,7 +849,8 @@ static void __init setup_nonnuma(void)
fake_numa_create_new_node(end_pfn, &nid);
memblock_set_node(PFN_PHYS(start_pfn),
- PFN_PHYS(end_pfn - start_pfn), nid);
+ PFN_PHYS(end_pfn - start_pfn),
+ &memblock.memory, nid);
node_set_online(nid);
}
}
@@ -932,7 +975,7 @@ static void __init *careful_zallocation(int nid, unsigned long size,
return ret;
}
-static struct notifier_block __cpuinitdata ppc64_numa_nb = {
+static struct notifier_block ppc64_numa_nb = {
.notifier_call = cpu_numa_callback,
.priority = 1 /* Must run before sched domains notifier. */
};
@@ -948,8 +991,7 @@ static void __init mark_reserved_regions_for_nid(int nid)
unsigned long start_pfn = physbase >> PAGE_SHIFT;
unsigned long end_pfn = PFN_UP(physbase + size);
struct node_active_region node_ar;
- unsigned long node_end_pfn = node->node_start_pfn +
- node->node_spanned_pages;
+ unsigned long node_end_pfn = pgdat_end_pfn(node);
/*
* Check to make sure that this memblock.reserved area is
@@ -1078,6 +1120,7 @@ void __init do_init_bootmem(void)
*/
setup_node_to_cpumask_map();
+ reset_numa_cpu_lookup_table();
register_cpu_notifier(&ppc64_numa_nb);
cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
(void *)(unsigned long)boot_cpuid);
@@ -1119,7 +1162,7 @@ early_param("numa", early_numa);
static int hot_add_drconf_scn_to_nid(struct device_node *memory,
unsigned long scn_addr)
{
- const u32 *dm;
+ const __be32 *dm;
unsigned int drconf_cell_cnt, rc;
unsigned long lmb_size;
struct assoc_arrays aa;
@@ -1164,7 +1207,7 @@ static int hot_add_drconf_scn_to_nid(struct device_node *memory,
* represented in the device tree as a node (i.e. memory@XXXX) for
* each memblock.
*/
-int hot_add_node_scn_to_nid(unsigned long scn_addr)
+static int hot_add_node_scn_to_nid(unsigned long scn_addr)
{
struct device_node *memory;
int nid = -1;
@@ -1172,7 +1215,7 @@ int hot_add_node_scn_to_nid(unsigned long scn_addr)
for_each_node_by_type(memory, "memory") {
unsigned long start, size;
int ranges;
- const unsigned int *memcell_buf;
+ const __be32 *memcell_buf;
unsigned int len;
memcell_buf = of_get_property(memory, "reg", &len);
@@ -1245,7 +1288,7 @@ static u64 hot_add_drconf_memory_max(void)
struct device_node *memory = NULL;
unsigned int drconf_cell_cnt = 0;
u64 lmb_size = 0;
- const u32 *dm = 0;
+ const __be32 *dm = NULL;
memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
if (memory) {
@@ -1270,10 +1313,18 @@ u64 memory_hotplug_max(void)
/* Virtual Processor Home Node (VPHN) support */
#ifdef CONFIG_PPC_SPLPAR
+struct topology_update_data {
+ struct topology_update_data *next;
+ unsigned int cpu;
+ int old_nid;
+ int new_nid;
+};
+
static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
static cpumask_t cpu_associativity_changes_mask;
static int vphn_enabled;
-static void set_topology_timer(void);
+static int prrn_enabled;
+static void reset_topology_timer(void);
/*
* Store the current values of the associativity change counters in the
@@ -1309,11 +1360,9 @@ static void setup_cpu_associativity_change_counters(void)
*/
static int update_cpu_associativity_changes_mask(void)
{
- int cpu, nr_cpus = 0;
+ int cpu;
cpumask_t *changes = &cpu_associativity_changes_mask;
- cpumask_clear(changes);
-
for_each_possible_cpu(cpu) {
int i, changed = 0;
u8 *counts = vphn_cpu_change_counts[cpu];
@@ -1326,12 +1375,12 @@ static int update_cpu_associativity_changes_mask(void)
}
}
if (changed) {
- cpumask_set_cpu(cpu, changes);
- nr_cpus++;
+ cpumask_or(changes, changes, cpu_sibling_mask(cpu));
+ cpu = cpu_last_thread_sibling(cpu);
}
}
- return nr_cpus;
+ return cpumask_weight(changes);
}
/*
@@ -1344,40 +1393,41 @@ static int update_cpu_associativity_changes_mask(void)
* Convert the associativity domain numbers returned from the hypervisor
* to the sequence they would appear in the ibm,associativity property.
*/
-static int vphn_unpack_associativity(const long *packed, unsigned int *unpacked)
+static int vphn_unpack_associativity(const long *packed, __be32 *unpacked)
{
int i, nr_assoc_doms = 0;
- const u16 *field = (const u16*) packed;
+ const __be16 *field = (const __be16 *) packed;
#define VPHN_FIELD_UNUSED (0xffff)
#define VPHN_FIELD_MSB (0x8000)
#define VPHN_FIELD_MASK (~VPHN_FIELD_MSB)
for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) {
- if (*field == VPHN_FIELD_UNUSED) {
+ if (be16_to_cpup(field) == VPHN_FIELD_UNUSED) {
/* All significant fields processed, and remaining
* fields contain the reserved value of all 1's.
* Just store them.
*/
- unpacked[i] = *((u32*)field);
+ unpacked[i] = *((__be32 *)field);
field += 2;
- } else if (*field & VPHN_FIELD_MSB) {
+ } else if (be16_to_cpup(field) & VPHN_FIELD_MSB) {
/* Data is in the lower 15 bits of this field */
- unpacked[i] = *field & VPHN_FIELD_MASK;
+ unpacked[i] = cpu_to_be32(
+ be16_to_cpup(field) & VPHN_FIELD_MASK);
field++;
nr_assoc_doms++;
} else {
/* Data is in the lower 15 bits of this field
* concatenated with the next 16 bit field
*/
- unpacked[i] = *((u32*)field);
+ unpacked[i] = *((__be32 *)field);
field += 2;
nr_assoc_doms++;
}
}
/* The first cell contains the length of the property */
- unpacked[0] = nr_assoc_doms;
+ unpacked[0] = cpu_to_be32(nr_assoc_doms);
return nr_assoc_doms;
}
@@ -1386,7 +1436,7 @@ static int vphn_unpack_associativity(const long *packed, unsigned int *unpacked)
* Retrieve the new associativity information for a virtual processor's
* home node.
*/
-static long hcall_vphn(unsigned long cpu, unsigned int *associativity)
+static long hcall_vphn(unsigned long cpu, __be32 *associativity)
{
long rc;
long retbuf[PLPAR_HCALL9_BUFSIZE] = {0};
@@ -1400,7 +1450,7 @@ static long hcall_vphn(unsigned long cpu, unsigned int *associativity)
}
static long vphn_get_associativity(unsigned long cpu,
- unsigned int *associativity)
+ __be32 *associativity)
{
long rc;
@@ -1423,40 +1473,162 @@ static long vphn_get_associativity(unsigned long cpu,
}
/*
+ * Update the CPU maps and sysfs entries for a single CPU when its NUMA
+ * characteristics change. This function doesn't perform any locking and is
+ * only safe to call from stop_machine().
+ */
+static int update_cpu_topology(void *data)
+{
+ struct topology_update_data *update;
+ unsigned long cpu;
+
+ if (!data)
+ return -EINVAL;
+
+ cpu = smp_processor_id();
+
+ for (update = data; update; update = update->next) {
+ if (cpu != update->cpu)
+ continue;
+
+ unmap_cpu_from_node(update->cpu);
+ map_cpu_to_node(update->cpu, update->new_nid);
+ vdso_getcpu_init();
+ }
+
+ return 0;
+}
+
+static int update_lookup_table(void *data)
+{
+ struct topology_update_data *update;
+
+ if (!data)
+ return -EINVAL;
+
+ /*
+ * Upon topology update, the numa-cpu lookup table needs to be updated
+ * for all threads in the core, including offline CPUs, to ensure that
+ * future hotplug operations respect the cpu-to-node associativity
+ * properly.
+ */
+ for (update = data; update; update = update->next) {
+ int nid, base, j;
+
+ nid = update->new_nid;
+ base = cpu_first_thread_sibling(update->cpu);
+
+ for (j = 0; j < threads_per_core; j++) {
+ update_numa_cpu_lookup_table(base + j, nid);
+ }
+ }
+
+ return 0;
+}
+
+/*
* Update the node maps and sysfs entries for each cpu whose home node
* has changed. Returns 1 when the topology has changed, and 0 otherwise.
*/
int arch_update_cpu_topology(void)
{
- int cpu, nid, old_nid, changed = 0;
- unsigned int associativity[VPHN_ASSOC_BUFSIZE] = {0};
+ unsigned int cpu, sibling, changed = 0;
+ struct topology_update_data *updates, *ud;
+ __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
+ cpumask_t updated_cpus;
struct device *dev;
+ int weight, new_nid, i = 0;
- for_each_cpu(cpu,&cpu_associativity_changes_mask) {
- vphn_get_associativity(cpu, associativity);
- nid = associativity_to_nid(associativity);
+ weight = cpumask_weight(&cpu_associativity_changes_mask);
+ if (!weight)
+ return 0;
- if (nid < 0 || !node_online(nid))
- nid = first_online_node;
+ updates = kzalloc(weight * (sizeof(*updates)), GFP_KERNEL);
+ if (!updates)
+ return 0;
- old_nid = numa_cpu_lookup_table[cpu];
+ cpumask_clear(&updated_cpus);
- /* Disable hotplug while we update the cpu
- * masks and sysfs.
+ for_each_cpu(cpu, &cpu_associativity_changes_mask) {
+ /*
+ * If siblings aren't flagged for changes, updates list
+ * will be too short. Skip on this update and set for next
+ * update.
*/
- get_online_cpus();
- unregister_cpu_under_node(cpu, old_nid);
- unmap_cpu_from_node(cpu);
- map_cpu_to_node(cpu, nid);
- register_cpu_under_node(cpu, nid);
- put_online_cpus();
-
- dev = get_cpu_device(cpu);
+ if (!cpumask_subset(cpu_sibling_mask(cpu),
+ &cpu_associativity_changes_mask)) {
+ pr_info("Sibling bits not set for associativity "
+ "change, cpu%d\n", cpu);
+ cpumask_or(&cpu_associativity_changes_mask,
+ &cpu_associativity_changes_mask,
+ cpu_sibling_mask(cpu));
+ cpu = cpu_last_thread_sibling(cpu);
+ continue;
+ }
+
+ /* Use associativity from first thread for all siblings */
+ vphn_get_associativity(cpu, associativity);
+ new_nid = associativity_to_nid(associativity);
+ if (new_nid < 0 || !node_online(new_nid))
+ new_nid = first_online_node;
+
+ if (new_nid == numa_cpu_lookup_table[cpu]) {
+ cpumask_andnot(&cpu_associativity_changes_mask,
+ &cpu_associativity_changes_mask,
+ cpu_sibling_mask(cpu));
+ cpu = cpu_last_thread_sibling(cpu);
+ continue;
+ }
+
+ for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
+ ud = &updates[i++];
+ ud->cpu = sibling;
+ ud->new_nid = new_nid;
+ ud->old_nid = numa_cpu_lookup_table[sibling];
+ cpumask_set_cpu(sibling, &updated_cpus);
+ if (i < weight)
+ ud->next = &updates[i];
+ }
+ cpu = cpu_last_thread_sibling(cpu);
+ }
+
+ /*
+ * In cases where we have nothing to update (because the updates list
+ * is too short or because the new topology is same as the old one),
+ * skip invoking update_cpu_topology() via stop-machine(). This is
+ * necessary (and not just a fast-path optimization) since stop-machine
+ * can end up electing a random CPU to run update_cpu_topology(), and
+ * thus trick us into setting up incorrect cpu-node mappings (since
+ * 'updates' is kzalloc()'ed).
+ *
+ * And for the similar reason, we will skip all the following updating.
+ */
+ if (!cpumask_weight(&updated_cpus))
+ goto out;
+
+ stop_machine(update_cpu_topology, &updates[0], &updated_cpus);
+
+ /*
+ * Update the numa-cpu lookup table with the new mappings, even for
+ * offline CPUs. It is best to perform this update from the stop-
+ * machine context.
+ */
+ stop_machine(update_lookup_table, &updates[0],
+ cpumask_of(raw_smp_processor_id()));
+
+ for (ud = &updates[0]; ud; ud = ud->next) {
+ unregister_cpu_under_node(ud->cpu, ud->old_nid);
+ register_cpu_under_node(ud->cpu, ud->new_nid);
+
+ dev = get_cpu_device(ud->cpu);
if (dev)
kobject_uevent(&dev->kobj, KOBJ_CHANGE);
+ cpumask_clear_cpu(ud->cpu, &cpu_associativity_changes_mask);
changed = 1;
}
+out:
+ kfree(updates);
return changed;
}
@@ -1466,56 +1638,172 @@ static void topology_work_fn(struct work_struct *work)
}
static DECLARE_WORK(topology_work, topology_work_fn);
-void topology_schedule_update(void)
+static void topology_schedule_update(void)
{
schedule_work(&topology_work);
}
static void topology_timer_fn(unsigned long ignored)
{
- if (!vphn_enabled)
- return;
- if (update_cpu_associativity_changes_mask() > 0)
+ if (prrn_enabled && cpumask_weight(&cpu_associativity_changes_mask))
topology_schedule_update();
- set_topology_timer();
+ else if (vphn_enabled) {
+ if (update_cpu_associativity_changes_mask() > 0)
+ topology_schedule_update();
+ reset_topology_timer();
+ }
}
static struct timer_list topology_timer =
TIMER_INITIALIZER(topology_timer_fn, 0, 0);
-static void set_topology_timer(void)
+static void reset_topology_timer(void)
{
topology_timer.data = 0;
topology_timer.expires = jiffies + 60 * HZ;
- add_timer(&topology_timer);
+ mod_timer(&topology_timer, topology_timer.expires);
}
+#ifdef CONFIG_SMP
+
+static void stage_topology_update(int core_id)
+{
+ cpumask_or(&cpu_associativity_changes_mask,
+ &cpu_associativity_changes_mask, cpu_sibling_mask(core_id));
+ reset_topology_timer();
+}
+
+static int dt_update_callback(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct of_prop_reconfig *update;
+ int rc = NOTIFY_DONE;
+
+ switch (action) {
+ case OF_RECONFIG_UPDATE_PROPERTY:
+ update = (struct of_prop_reconfig *)data;
+ if (!of_prop_cmp(update->dn->type, "cpu") &&
+ !of_prop_cmp(update->prop->name, "ibm,associativity")) {
+ u32 core_id;
+ of_property_read_u32(update->dn, "reg", &core_id);
+ stage_topology_update(core_id);
+ rc = NOTIFY_OK;
+ }
+ break;
+ }
+
+ return rc;
+}
+
+static struct notifier_block dt_update_nb = {
+ .notifier_call = dt_update_callback,
+};
+
+#endif
+
/*
- * Start polling for VPHN associativity changes.
+ * Start polling for associativity changes.
*/
int start_topology_update(void)
{
int rc = 0;
- /* Disabled until races with load balancing are fixed */
- if (0 && firmware_has_feature(FW_FEATURE_VPHN) &&
- get_lppaca()->shared_proc) {
- vphn_enabled = 1;
- setup_cpu_associativity_change_counters();
- init_timer_deferrable(&topology_timer);
- set_topology_timer();
- rc = 1;
+ if (firmware_has_feature(FW_FEATURE_PRRN)) {
+ if (!prrn_enabled) {
+ prrn_enabled = 1;
+ vphn_enabled = 0;
+#ifdef CONFIG_SMP
+ rc = of_reconfig_notifier_register(&dt_update_nb);
+#endif
+ }
+ } else if (firmware_has_feature(FW_FEATURE_VPHN) &&
+ lppaca_shared_proc(get_lppaca())) {
+ if (!vphn_enabled) {
+ prrn_enabled = 0;
+ vphn_enabled = 1;
+ setup_cpu_associativity_change_counters();
+ init_timer_deferrable(&topology_timer);
+ reset_topology_timer();
+ }
}
return rc;
}
-__initcall(start_topology_update);
/*
* Disable polling for VPHN associativity changes.
*/
int stop_topology_update(void)
{
- vphn_enabled = 0;
- return del_timer_sync(&topology_timer);
+ int rc = 0;
+
+ if (prrn_enabled) {
+ prrn_enabled = 0;
+#ifdef CONFIG_SMP
+ rc = of_reconfig_notifier_unregister(&dt_update_nb);
+#endif
+ } else if (vphn_enabled) {
+ vphn_enabled = 0;
+ rc = del_timer_sync(&topology_timer);
+ }
+
+ return rc;
+}
+
+int prrn_is_enabled(void)
+{
+ return prrn_enabled;
+}
+
+static int topology_read(struct seq_file *file, void *v)
+{
+ if (vphn_enabled || prrn_enabled)
+ seq_puts(file, "on\n");
+ else
+ seq_puts(file, "off\n");
+
+ return 0;
+}
+
+static int topology_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, topology_read, NULL);
+}
+
+static ssize_t topology_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *off)
+{
+ char kbuf[4]; /* "on" or "off" plus null. */
+ int read_len;
+
+ read_len = count < 3 ? count : 3;
+ if (copy_from_user(kbuf, buf, read_len))
+ return -EINVAL;
+
+ kbuf[read_len] = '\0';
+
+ if (!strncmp(kbuf, "on", 2))
+ start_topology_update();
+ else if (!strncmp(kbuf, "off", 3))
+ stop_topology_update();
+ else
+ return -EINVAL;
+
+ return count;
+}
+
+static const struct file_operations topology_ops = {
+ .read = seq_read,
+ .write = topology_write,
+ .open = topology_open,
+ .release = single_release
+};
+
+static int topology_update_init(void)
+{
+ start_topology_update();
+ proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops);
+
+ return 0;
}
+device_initcall(topology_update_init);
#endif /* CONFIG_PPC_SPLPAR */
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 214130a4edc..c695943a513 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -24,7 +24,6 @@
#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/mm.h>
-#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/hugetlb.h>
@@ -32,8 +31,6 @@
#include <asm/tlbflush.h>
#include <asm/tlb.h>
-#include "mmu_decl.h"
-
static inline int is_exec_fault(void)
{
return current->thread.regs && TRAP(current->thread.regs) == 0x400;
@@ -72,7 +69,7 @@ struct page * maybe_pte_to_page(pte_t pte)
* support falls into the same category.
*/
-static pte_t set_pte_filter(pte_t pte, unsigned long addr)
+static pte_t set_pte_filter(pte_t pte)
{
pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) ||
@@ -81,17 +78,6 @@ static pte_t set_pte_filter(pte_t pte, unsigned long addr)
if (!pg)
return pte;
if (!test_bit(PG_arch_1, &pg->flags)) {
-#ifdef CONFIG_8xx
- /* On 8xx, cache control instructions (particularly
- * "dcbst" from flush_dcache_icache) fault as write
- * operation if there is an unpopulated TLB entry
- * for the address in question. To workaround that,
- * we invalidate the TLB here, thus avoiding dcbst
- * misbehaviour.
- */
- /* 8xx doesn't care about PID, size or ind args */
- _tlbil_va(addr, 0, 0, 0);
-#endif /* CONFIG_8xx */
flush_dcache_icache_page(pg);
set_bit(PG_arch_1, &pg->flags);
}
@@ -111,7 +97,7 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
* as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC so
* instead we "filter out" the exec permission for non clean pages.
*/
-static pte_t set_pte_filter(pte_t pte, unsigned long addr)
+static pte_t set_pte_filter(pte_t pte)
{
struct page *pg;
@@ -187,13 +173,13 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
pte_t pte)
{
#ifdef CONFIG_DEBUG_VM
- WARN_ON(pte_present(*ptep));
+ WARN_ON(pte_val(*ptep) & _PAGE_PRESENT);
#endif
/* Note: mm->context.id might not yet have been assigned as
* this context might not have been activated yet when this
* is called.
*/
- pte = set_pte_filter(pte, addr);
+ pte = set_pte_filter(pte);
/* Perform the setting of the PTE */
__set_pte_at(mm, addr, ptep, pte, 0);
@@ -235,6 +221,14 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
pud = pud_offset(pgd, addr);
BUG_ON(pud_none(*pud));
pmd = pmd_offset(pud, addr);
+ /*
+ * khugepaged to collapse normal pages to hugepage, first set
+ * pmd to none to force page fault/gup to take mmap_sem. After
+ * pmd is set to none, we do a pte_clear which does this assertion
+ * so if we find pmd none, return.
+ */
+ if (pmd_none(*pmd))
+ return;
BUG_ON(!pmd_present(*pmd));
assert_spin_locked(pte_lockptr(mm, pmd));
}
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 6c856fb8c15..343a87fa78b 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -121,7 +121,10 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
ptepage = alloc_pages(flags, 0);
if (!ptepage)
return NULL;
- pgtable_page_ctor(ptepage);
+ if (!pgtable_page_ctor(ptepage)) {
+ __free_page(ptepage);
+ return NULL;
+ }
return ptepage;
}
@@ -296,6 +299,7 @@ int map_page(unsigned long va, phys_addr_t pa, int flags)
set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT,
__pgprot(flags)));
}
+ smp_wmb();
return err;
}
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index e212a271c7a..f6ce1f111f5 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -33,7 +33,6 @@
#include <linux/swap.h>
#include <linux/stddef.h>
#include <linux/vmalloc.h>
-#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/slab.h>
@@ -61,7 +60,7 @@
#endif
#ifdef CONFIG_PPC_STD_MMU_64
-#if TASK_SIZE_USER64 > (1UL << (USER_ESID_BITS + SID_SHIFT))
+#if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT))
#error TASK_SIZE_USER64 exceeds user VSID range
#endif
#endif
@@ -153,6 +152,18 @@ int map_kernel_page(unsigned long ea, unsigned long pa, int flags)
}
#endif /* !CONFIG_PPC_MMU_NOHASH */
}
+
+#ifdef CONFIG_PPC_BOOK3E_64
+ /*
+ * With hardware tablewalk, a sync is needed to ensure that
+ * subsequent accesses see the PTE we just wrote. Unlike userspace
+ * mappings, we can't tolerate spurious faults, so make sure
+ * the new PTE will be seen the first time.
+ */
+ mb();
+#else
+ smp_wmb();
+#endif
return 0;
}
@@ -337,3 +348,543 @@ EXPORT_SYMBOL(__ioremap_at);
EXPORT_SYMBOL(iounmap);
EXPORT_SYMBOL(__iounmap);
EXPORT_SYMBOL(__iounmap_at);
+
+/*
+ * For hugepage we have pfn in the pmd, we use PTE_RPN_SHIFT bits for flags
+ * For PTE page, we have a PTE_FRAG_SIZE (4K) aligned virtual address.
+ */
+struct page *pmd_page(pmd_t pmd)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (pmd_trans_huge(pmd))
+ return pfn_to_page(pmd_pfn(pmd));
+#endif
+ return virt_to_page(pmd_page_vaddr(pmd));
+}
+
+#ifdef CONFIG_PPC_64K_PAGES
+static pte_t *get_from_cache(struct mm_struct *mm)
+{
+ void *pte_frag, *ret;
+
+ spin_lock(&mm->page_table_lock);
+ ret = mm->context.pte_frag;
+ if (ret) {
+ pte_frag = ret + PTE_FRAG_SIZE;
+ /*
+ * If we have taken up all the fragments mark PTE page NULL
+ */
+ if (((unsigned long)pte_frag & ~PAGE_MASK) == 0)
+ pte_frag = NULL;
+ mm->context.pte_frag = pte_frag;
+ }
+ spin_unlock(&mm->page_table_lock);
+ return (pte_t *)ret;
+}
+
+static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel)
+{
+ void *ret = NULL;
+ struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
+ __GFP_REPEAT | __GFP_ZERO);
+ if (!page)
+ return NULL;
+ if (!kernel && !pgtable_page_ctor(page)) {
+ __free_page(page);
+ return NULL;
+ }
+
+ ret = page_address(page);
+ spin_lock(&mm->page_table_lock);
+ /*
+ * If we find pgtable_page set, we return
+ * the allocated page with single fragement
+ * count.
+ */
+ if (likely(!mm->context.pte_frag)) {
+ atomic_set(&page->_count, PTE_FRAG_NR);
+ mm->context.pte_frag = ret + PTE_FRAG_SIZE;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ return (pte_t *)ret;
+}
+
+pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel)
+{
+ pte_t *pte;
+
+ pte = get_from_cache(mm);
+ if (pte)
+ return pte;
+
+ return __alloc_for_cache(mm, kernel);
+}
+
+void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel)
+{
+ struct page *page = virt_to_page(table);
+ if (put_page_testzero(page)) {
+ if (!kernel)
+ pgtable_page_dtor(page);
+ free_hot_cold_page(page, 0);
+ }
+}
+
+#ifdef CONFIG_SMP
+static void page_table_free_rcu(void *table)
+{
+ struct page *page = virt_to_page(table);
+ if (put_page_testzero(page)) {
+ pgtable_page_dtor(page);
+ free_hot_cold_page(page, 0);
+ }
+}
+
+void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
+{
+ unsigned long pgf = (unsigned long)table;
+
+ BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
+ pgf |= shift;
+ tlb_remove_table(tlb, (void *)pgf);
+}
+
+void __tlb_remove_table(void *_table)
+{
+ void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
+ unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
+
+ if (!shift)
+ /* PTE page needs special handling */
+ page_table_free_rcu(table);
+ else {
+ BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
+ kmem_cache_free(PGT_CACHE(shift), table);
+ }
+}
+#else
+void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
+{
+ if (!shift) {
+ /* PTE page needs special handling */
+ struct page *page = virt_to_page(table);
+ if (put_page_testzero(page)) {
+ pgtable_page_dtor(page);
+ free_hot_cold_page(page, 0);
+ }
+ } else {
+ BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
+ kmem_cache_free(PGT_CACHE(shift), table);
+ }
+}
+#endif
+#endif /* CONFIG_PPC_64K_PAGES */
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+/*
+ * This is called when relaxing access to a hugepage. It's also called in the page
+ * fault path when we don't hit any of the major fault cases, ie, a minor
+ * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have
+ * handled those two for us, we additionally deal with missing execute
+ * permission here on some processors
+ */
+int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp, pmd_t entry, int dirty)
+{
+ int changed;
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(!pmd_trans_huge(*pmdp));
+ assert_spin_locked(&vma->vm_mm->page_table_lock);
+#endif
+ changed = !pmd_same(*(pmdp), entry);
+ if (changed) {
+ __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry));
+ /*
+ * Since we are not supporting SW TLB systems, we don't
+ * have any thing similar to flush_tlb_page_nohash()
+ */
+ }
+ return changed;
+}
+
+unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, unsigned long clr,
+ unsigned long set)
+{
+
+ unsigned long old, tmp;
+
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(!pmd_trans_huge(*pmdp));
+ assert_spin_locked(&mm->page_table_lock);
+#endif
+
+#ifdef PTE_ATOMIC_UPDATES
+ __asm__ __volatile__(
+ "1: ldarx %0,0,%3\n\
+ andi. %1,%0,%6\n\
+ bne- 1b \n\
+ andc %1,%0,%4 \n\
+ or %1,%1,%7\n\
+ stdcx. %1,0,%3 \n\
+ bne- 1b"
+ : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
+ : "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY), "r" (set)
+ : "cc" );
+#else
+ old = pmd_val(*pmdp);
+ *pmdp = __pmd((old & ~clr) | set);
+#endif
+ if (old & _PAGE_HASHPTE)
+ hpte_do_hugepage_flush(mm, addr, pmdp);
+ return old;
+}
+
+pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ pmd_t pmd;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ if (pmd_trans_huge(*pmdp)) {
+ pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
+ } else {
+ /*
+ * khugepaged calls this for normal pmd
+ */
+ pmd = *pmdp;
+ pmd_clear(pmdp);
+ /*
+ * Wait for all pending hash_page to finish. This is needed
+ * in case of subpage collapse. When we collapse normal pages
+ * to hugepage, we first clear the pmd, then invalidate all
+ * the PTE entries. The assumption here is that any low level
+ * page fault will see a none pmd and take the slow path that
+ * will wait on mmap_sem. But we could very well be in a
+ * hash_page with local ptep pointer value. Such a hash page
+ * can result in adding new HPTE entries for normal subpages.
+ * That means we could be modifying the page content as we
+ * copy them to a huge page. So wait for parallel hash_page
+ * to finish before invalidating HPTE entries. We can do this
+ * by sending an IPI to all the cpus and executing a dummy
+ * function there.
+ */
+ kick_all_cpus_sync();
+ /*
+ * Now invalidate the hpte entries in the range
+ * covered by pmd. This make sure we take a
+ * fault and will find the pmd as none, which will
+ * result in a major fault which takes mmap_sem and
+ * hence wait for collapse to complete. Without this
+ * the __collapse_huge_page_copy can result in copying
+ * the old content.
+ */
+ flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
+ }
+ return pmd;
+}
+
+int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
+}
+
+/*
+ * We currently remove entries from the hashtable regardless of whether
+ * the entry was young or dirty. The generic routines only flush if the
+ * entry was young or dirty which is not good enough.
+ *
+ * We should be more intelligent about this but for the moment we override
+ * these functions and force a tlb flush unconditionally
+ */
+int pmdp_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
+}
+
+/*
+ * We mark the pmd splitting and invalidate all the hpte
+ * entries for this hugepage.
+ */
+void pmdp_splitting_flush(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ unsigned long old, tmp;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(!pmd_trans_huge(*pmdp));
+ assert_spin_locked(&vma->vm_mm->page_table_lock);
+#endif
+
+#ifdef PTE_ATOMIC_UPDATES
+
+ __asm__ __volatile__(
+ "1: ldarx %0,0,%3\n\
+ andi. %1,%0,%6\n\
+ bne- 1b \n\
+ ori %1,%0,%4 \n\
+ stdcx. %1,0,%3 \n\
+ bne- 1b"
+ : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
+ : "r" (pmdp), "i" (_PAGE_SPLITTING), "m" (*pmdp), "i" (_PAGE_BUSY)
+ : "cc" );
+#else
+ old = pmd_val(*pmdp);
+ *pmdp = __pmd(old | _PAGE_SPLITTING);
+#endif
+ /*
+ * If we didn't had the splitting flag set, go and flush the
+ * HPTE entries.
+ */
+ if (!(old & _PAGE_SPLITTING)) {
+ /* We need to flush the hpte */
+ if (old & _PAGE_HASHPTE)
+ hpte_do_hugepage_flush(vma->vm_mm, address, pmdp);
+ }
+ /*
+ * This ensures that generic code that rely on IRQ disabling
+ * to prevent a parallel THP split work as expected.
+ */
+ kick_all_cpus_sync();
+}
+
+/*
+ * We want to put the pgtable in pmd and use pgtable for tracking
+ * the base page size hptes
+ */
+void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+ pgtable_t pgtable)
+{
+ pgtable_t *pgtable_slot;
+ assert_spin_locked(&mm->page_table_lock);
+ /*
+ * we store the pgtable in the second half of PMD
+ */
+ pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+ *pgtable_slot = pgtable;
+ /*
+ * expose the deposited pgtable to other cpus.
+ * before we set the hugepage PTE at pmd level
+ * hash fault code looks at the deposted pgtable
+ * to store hash index values.
+ */
+ smp_wmb();
+}
+
+pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
+{
+ pgtable_t pgtable;
+ pgtable_t *pgtable_slot;
+
+ assert_spin_locked(&mm->page_table_lock);
+ pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+ pgtable = *pgtable_slot;
+ /*
+ * Once we withdraw, mark the entry NULL.
+ */
+ *pgtable_slot = NULL;
+ /*
+ * We store HPTE information in the deposited PTE fragment.
+ * zero out the content on withdraw.
+ */
+ memset(pgtable, 0, PTE_FRAG_SIZE);
+ return pgtable;
+}
+
+/*
+ * set a new huge pmd. We should not be called for updating
+ * an existing pmd entry. That should go via pmd_hugepage_update.
+ */
+void set_pmd_at(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, pmd_t pmd)
+{
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(pmd_val(*pmdp) & _PAGE_PRESENT);
+ assert_spin_locked(&mm->page_table_lock);
+ WARN_ON(!pmd_trans_huge(pmd));
+#endif
+ return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
+}
+
+void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0);
+}
+
+/*
+ * A linux hugepage PMD was changed and the corresponding hash table entries
+ * neesd to be flushed.
+ */
+void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp)
+{
+ int ssize, i;
+ unsigned long s_addr;
+ int max_hpte_count;
+ unsigned int psize, valid;
+ unsigned char *hpte_slot_array;
+ unsigned long hidx, vpn, vsid, hash, shift, slot;
+
+ /*
+ * Flush all the hptes mapping this hugepage
+ */
+ s_addr = addr & HPAGE_PMD_MASK;
+ hpte_slot_array = get_hpte_slot_array(pmdp);
+ /*
+ * IF we try to do a HUGE PTE update after a withdraw is done.
+ * we will find the below NULL. This happens when we do
+ * split_huge_page_pmd
+ */
+ if (!hpte_slot_array)
+ return;
+
+ /* get the base page size */
+ psize = get_slice_psize(mm, s_addr);
+
+ if (ppc_md.hugepage_invalidate)
+ return ppc_md.hugepage_invalidate(mm, hpte_slot_array,
+ s_addr, psize);
+ /*
+ * No bluk hpte removal support, invalidate each entry
+ */
+ shift = mmu_psize_defs[psize].shift;
+ max_hpte_count = HPAGE_PMD_SIZE >> shift;
+ for (i = 0; i < max_hpte_count; i++) {
+ /*
+ * 8 bits per each hpte entries
+ * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
+ */
+ valid = hpte_valid(hpte_slot_array, i);
+ if (!valid)
+ continue;
+ hidx = hpte_hash_index(hpte_slot_array, i);
+
+ /* get the vpn */
+ addr = s_addr + (i * (1ul << shift));
+ if (!is_kernel_addr(addr)) {
+ ssize = user_segment_size(addr);
+ vsid = get_vsid(mm->context.id, addr, ssize);
+ WARN_ON(vsid == 0);
+ } else {
+ vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+ ssize = mmu_kernel_ssize;
+ }
+
+ vpn = hpt_vpn(addr, vsid, ssize);
+ hash = hpt_hash(vpn, shift, ssize);
+ if (hidx & _PTEIDX_SECONDARY)
+ hash = ~hash;
+
+ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot += hidx & _PTEIDX_GROUP_IX;
+ ppc_md.hpte_invalidate(slot, vpn, psize,
+ MMU_PAGE_16M, ssize, 0);
+ }
+}
+
+static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
+{
+ pmd_val(pmd) |= pgprot_val(pgprot);
+ return pmd;
+}
+
+pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
+{
+ pmd_t pmd;
+ /*
+ * For a valid pte, we would have _PAGE_PRESENT or _PAGE_FILE always
+ * set. We use this to check THP page at pmd level.
+ * leaf pte for huge page, bottom two bits != 00
+ */
+ pmd_val(pmd) = pfn << PTE_RPN_SHIFT;
+ pmd_val(pmd) |= _PAGE_THP_HUGE;
+ pmd = pmd_set_protbits(pmd, pgprot);
+ return pmd;
+}
+
+pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
+{
+ return pfn_pmd(page_to_pfn(page), pgprot);
+}
+
+pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
+{
+
+ pmd_val(pmd) &= _HPAGE_CHG_MASK;
+ pmd = pmd_set_protbits(pmd, newprot);
+ return pmd;
+}
+
+/*
+ * This is called at the end of handling a user page fault, when the
+ * fault has been handled by updating a HUGE PMD entry in the linux page tables.
+ * We use it to preload an HPTE into the hash table corresponding to
+ * the updated linux HUGE PMD entry.
+ */
+void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd)
+{
+ return;
+}
+
+pmd_t pmdp_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp)
+{
+ pmd_t old_pmd;
+ pgtable_t pgtable;
+ unsigned long old;
+ pgtable_t *pgtable_slot;
+
+ old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
+ old_pmd = __pmd(old);
+ /*
+ * We have pmd == none and we are holding page_table_lock.
+ * So we can safely go and clear the pgtable hash
+ * index info.
+ */
+ pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+ pgtable = *pgtable_slot;
+ /*
+ * Let's zero out old valid and hash index details
+ * hash fault look at them.
+ */
+ memset(pgtable, 0, PTE_FRAG_SIZE);
+ return old_pmd;
+}
+
+int has_transparent_hugepage(void)
+{
+ if (!mmu_has_feature(MMU_FTR_16M_PAGE))
+ return 0;
+ /*
+ * We support THP only if PMD_SIZE is 16MB.
+ */
+ if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
+ return 0;
+ /*
+ * We need to make sure that we support 16MB hugepage in a segement
+ * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
+ * of 64K.
+ */
+ /*
+ * If we have 64K HPTE, we will be using that by default
+ */
+ if (mmu_psize_defs[MMU_PAGE_64K].shift &&
+ (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
+ return 0;
+ /*
+ * Ok we only have 4K HPTE
+ */
+ if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
+ return 0;
+
+ return 1;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index a538c80db2d..0399a670295 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -66,8 +66,10 @@ static inline void slb_shadow_update(unsigned long ea, int ssize,
* we only update the current CPU's SLB shadow buffer.
*/
get_slb_shadow()->save_area[entry].esid = 0;
- get_slb_shadow()->save_area[entry].vsid = mk_vsid_data(ea, ssize, flags);
- get_slb_shadow()->save_area[entry].esid = mk_esid_data(ea, ssize, entry);
+ get_slb_shadow()->save_area[entry].vsid =
+ cpu_to_be64(mk_vsid_data(ea, ssize, flags));
+ get_slb_shadow()->save_area[entry].esid =
+ cpu_to_be64(mk_esid_data(ea, ssize, entry));
}
static inline void slb_shadow_clear(unsigned long entry)
@@ -95,7 +97,7 @@ static inline void create_shadowed_slbe(unsigned long ea, int ssize,
static void __slb_flush_and_rebolt(void)
{
/* If you change this make sure you change SLB_NUM_BOLTED
- * appropriately too. */
+ * and PR KVM appropriately too. */
unsigned long linear_llp, vmalloc_llp, lflags, vflags;
unsigned long ksp_esid_data, ksp_vsid_data;
@@ -112,7 +114,8 @@ static void __slb_flush_and_rebolt(void)
} else {
/* Update stack entry; others don't change */
slb_shadow_update(get_paca()->kstack, mmu_kernel_ssize, lflags, 2);
- ksp_vsid_data = get_slb_shadow()->save_area[2].vsid;
+ ksp_vsid_data =
+ be64_to_cpu(get_slb_shadow()->save_area[2].vsid);
}
/* We need to do this all in asm, so we're sure we don't touch
@@ -253,10 +256,14 @@ static inline void patch_slb_encoding(unsigned int *insn_addr,
patch_instruction(insn_addr, insn);
}
+extern u32 slb_compare_rr_to_size[];
+extern u32 slb_miss_kernel_load_linear[];
+extern u32 slb_miss_kernel_load_io[];
+extern u32 slb_compare_rr_to_size[];
+extern u32 slb_miss_kernel_load_vmemmap[];
+
void slb_set_size(u16 size)
{
- extern unsigned int *slb_compare_rr_to_size;
-
if (mmu_slb_size == size)
return;
@@ -269,11 +276,7 @@ void slb_initialize(void)
unsigned long linear_llp, vmalloc_llp, io_llp;
unsigned long lflags, vflags;
static int slb_encoding_inited;
- extern unsigned int *slb_miss_kernel_load_linear;
- extern unsigned int *slb_miss_kernel_load_io;
- extern unsigned int *slb_compare_rr_to_size;
#ifdef CONFIG_SPARSEMEM_VMEMMAP
- extern unsigned int *slb_miss_kernel_load_vmemmap;
unsigned long vmemmap_llp;
#endif
diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
index 1a16ca22775..736d18b3cef 100644
--- a/arch/powerpc/mm/slb_low.S
+++ b/arch/powerpc/mm/slb_low.S
@@ -31,10 +31,15 @@
* No other registers are examined or changed.
*/
_GLOBAL(slb_allocate_realmode)
- /* r3 = faulting address */
+ /*
+ * check for bad kernel/user address
+ * (ea & ~REGION_MASK) >= PGTABLE_RANGE
+ */
+ rldicr. r9,r3,4,(63 - PGTABLE_EADDR_SIZE - 4)
+ bne- 8f
srdi r9,r3,60 /* get region */
- srdi r10,r3,28 /* get esid */
+ srdi r10,r3,SID_SHIFT /* get esid */
cmpldi cr7,r9,0xc /* cmp PAGE_OFFSET for later use */
/* r3 = address, r10 = esid, cr7 = <> PAGE_OFFSET */
@@ -54,14 +59,17 @@ _GLOBAL(slb_allocate_realmode)
/* Linear mapping encoding bits, the "li" instruction below will
* be patched by the kernel at boot
*/
-_GLOBAL(slb_miss_kernel_load_linear)
+.globl slb_miss_kernel_load_linear
+slb_miss_kernel_load_linear:
li r11,0
- li r9,0x1
/*
- * for 1T we shift 12 bits more. slb_finish_load_1T will do
- * the necessary adjustment
+ * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1
+ * r9 = region id.
*/
- rldimi r10,r9,(CONTEXT_BITS + USER_ESID_BITS),0
+ addis r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha
+ addi r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l
+
+
BEGIN_FTR_SECTION
b slb_finish_load
END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
@@ -72,7 +80,8 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
/* Check virtual memmap region. To be patches at kernel boot */
cmpldi cr0,r9,0xf
bne 1f
-_GLOBAL(slb_miss_kernel_load_vmemmap)
+.globl slb_miss_kernel_load_vmemmap
+slb_miss_kernel_load_vmemmap:
li r11,0
b 6f
1:
@@ -88,27 +97,23 @@ _GLOBAL(slb_miss_kernel_load_vmemmap)
b 6f
5:
/* IO mapping */
- _GLOBAL(slb_miss_kernel_load_io)
+.globl slb_miss_kernel_load_io
+slb_miss_kernel_load_io:
li r11,0
6:
- li r9,0x1
/*
- * for 1T we shift 12 bits more. slb_finish_load_1T will do
- * the necessary adjustment
+ * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1
+ * r9 = region id.
*/
- rldimi r10,r9,(CONTEXT_BITS + USER_ESID_BITS),0
+ addis r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha
+ addi r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l
+
BEGIN_FTR_SECTION
b slb_finish_load
END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
b slb_finish_load_1T
-0: /* user address: proto-VSID = context << 15 | ESID. First check
- * if the address is within the boundaries of the user region
- */
- srdi. r9,r10,USER_ESID_BITS
- bne- 8f /* invalid ea bits set */
-
-
+0:
/* when using slices, we extract the psize off the slice bitmaps
* and then we need to get the sllp encoding off the mmu_psize_defs
* array.
@@ -164,15 +169,13 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
ld r9,PACACONTEXTID(r13)
BEGIN_FTR_SECTION
cmpldi r10,0x1000
-END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
- rldimi r10,r9,USER_ESID_BITS,0
-BEGIN_FTR_SECTION
bge slb_finish_load_1T
END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
b slb_finish_load
8: /* invalid EA */
li r10,0 /* BAD_VSID */
+ li r9,0 /* BAD_VSID */
li r11,SLB_VSID_USER /* flags don't much matter */
b slb_finish_load
@@ -221,8 +224,6 @@ _GLOBAL(slb_allocate_user)
/* get context to calculate proto-VSID */
ld r9,PACACONTEXTID(r13)
- rldimi r10,r9,USER_ESID_BITS,0
-
/* fall through slb_finish_load */
#endif /* __DISABLED__ */
@@ -231,9 +232,10 @@ _GLOBAL(slb_allocate_user)
/*
* Finish loading of an SLB entry and return
*
- * r3 = EA, r10 = proto-VSID, r11 = flags, clobbers r9, cr7 = <> PAGE_OFFSET
+ * r3 = EA, r9 = context, r10 = ESID, r11 = flags, clobbers r9, cr7 = <> PAGE_OFFSET
*/
slb_finish_load:
+ rldimi r10,r9,ESID_BITS,0
ASM_VSID_SCRAMBLE(r10,r9,256M)
/*
* bits above VSID_BITS_256M need to be ignored from r10
@@ -251,7 +253,8 @@ slb_finish_load:
7: ld r10,PACASTABRR(r13)
addi r10,r10,1
/* This gets soft patched on boot. */
-_GLOBAL(slb_compare_rr_to_size)
+.globl slb_compare_rr_to_size
+slb_compare_rr_to_size:
cmpldi r10,0
blt+ 4f
@@ -298,10 +301,11 @@ _GLOBAL(slb_compare_rr_to_size)
/*
* Finish loading of a 1T SLB entry (for the kernel linear mapping) and return.
*
- * r3 = EA, r10 = proto-VSID, r11 = flags, clobbers r9
+ * r3 = EA, r9 = context, r10 = ESID(256MB), r11 = flags, clobbers r9
*/
slb_finish_load_1T:
- srdi r10,r10,40-28 /* get 1T ESID */
+ srdi r10,r10,(SID_SHIFT_1T - SID_SHIFT) /* get 1T ESID */
+ rldimi r10,r9,ESID_BITS_1T,0
ASM_VSID_SCRAMBLE(r10,r9,1T)
/*
* bits above VSID_BITS_1T need to be ignored from r10
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index cf9dada734b..b0c75cc15ef 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -237,134 +237,112 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz
#endif
}
+/*
+ * Compute which slice addr is part of;
+ * set *boundary_addr to the start or end boundary of that slice
+ * (depending on 'end' parameter);
+ * return boolean indicating if the slice is marked as available in the
+ * 'available' slice_mark.
+ */
+static bool slice_scan_available(unsigned long addr,
+ struct slice_mask available,
+ int end,
+ unsigned long *boundary_addr)
+{
+ unsigned long slice;
+ if (addr < SLICE_LOW_TOP) {
+ slice = GET_LOW_SLICE_INDEX(addr);
+ *boundary_addr = (slice + end) << SLICE_LOW_SHIFT;
+ return !!(available.low_slices & (1u << slice));
+ } else {
+ slice = GET_HIGH_SLICE_INDEX(addr);
+ *boundary_addr = (slice + end) ?
+ ((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP;
+ return !!(available.high_slices & (1ul << slice));
+ }
+}
+
static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
unsigned long len,
struct slice_mask available,
- int psize, int use_cache)
+ int psize)
{
- struct vm_area_struct *vma;
- unsigned long start_addr, addr;
- struct slice_mask mask;
int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
-
- if (use_cache) {
- if (len <= mm->cached_hole_size) {
- start_addr = addr = TASK_UNMAPPED_BASE;
- mm->cached_hole_size = 0;
- } else
- start_addr = addr = mm->free_area_cache;
- } else
- start_addr = addr = TASK_UNMAPPED_BASE;
-
-full_search:
- for (;;) {
- addr = _ALIGN_UP(addr, 1ul << pshift);
- if ((TASK_SIZE - len) < addr)
- break;
- vma = find_vma(mm, addr);
- BUG_ON(vma && (addr >= vma->vm_end));
-
- mask = slice_range_to_mask(addr, len);
- if (!slice_check_fit(mask, available)) {
- if (addr < SLICE_LOW_TOP)
- addr = _ALIGN_UP(addr + 1, 1ul << SLICE_LOW_SHIFT);
- else
- addr = _ALIGN_UP(addr + 1, 1ul << SLICE_HIGH_SHIFT);
+ unsigned long addr, found, next_end;
+ struct vm_unmapped_area_info info;
+
+ info.flags = 0;
+ info.length = len;
+ info.align_mask = PAGE_MASK & ((1ul << pshift) - 1);
+ info.align_offset = 0;
+
+ addr = TASK_UNMAPPED_BASE;
+ while (addr < TASK_SIZE) {
+ info.low_limit = addr;
+ if (!slice_scan_available(addr, available, 1, &addr))
continue;
+
+ next_slice:
+ /*
+ * At this point [info.low_limit; addr) covers
+ * available slices only and ends at a slice boundary.
+ * Check if we need to reduce the range, or if we can
+ * extend it to cover the next available slice.
+ */
+ if (addr >= TASK_SIZE)
+ addr = TASK_SIZE;
+ else if (slice_scan_available(addr, available, 1, &next_end)) {
+ addr = next_end;
+ goto next_slice;
}
- if (!vma || addr + len <= vma->vm_start) {
- /*
- * Remember the place where we stopped the search:
- */
- if (use_cache)
- mm->free_area_cache = addr + len;
- return addr;
- }
- if (use_cache && (addr + mm->cached_hole_size) < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
- addr = vma->vm_end;
- }
+ info.high_limit = addr;
- /* Make sure we didn't miss any holes */
- if (use_cache && start_addr != TASK_UNMAPPED_BASE) {
- start_addr = addr = TASK_UNMAPPED_BASE;
- mm->cached_hole_size = 0;
- goto full_search;
+ found = vm_unmapped_area(&info);
+ if (!(found & ~PAGE_MASK))
+ return found;
}
+
return -ENOMEM;
}
static unsigned long slice_find_area_topdown(struct mm_struct *mm,
unsigned long len,
struct slice_mask available,
- int psize, int use_cache)
+ int psize)
{
- struct vm_area_struct *vma;
- unsigned long addr;
- struct slice_mask mask;
int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
+ unsigned long addr, found, prev;
+ struct vm_unmapped_area_info info;
- /* check if free_area_cache is useful for us */
- if (use_cache) {
- if (len <= mm->cached_hole_size) {
- mm->cached_hole_size = 0;
- mm->free_area_cache = mm->mmap_base;
- }
-
- /* either no address requested or can't fit in requested
- * address hole
- */
- addr = mm->free_area_cache;
-
- /* make sure it can fit in the remaining address space */
- if (addr > len) {
- addr = _ALIGN_DOWN(addr - len, 1ul << pshift);
- mask = slice_range_to_mask(addr, len);
- if (slice_check_fit(mask, available) &&
- slice_area_is_free(mm, addr, len))
- /* remember the address as a hint for
- * next time
- */
- return (mm->free_area_cache = addr);
- }
- }
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ info.length = len;
+ info.align_mask = PAGE_MASK & ((1ul << pshift) - 1);
+ info.align_offset = 0;
addr = mm->mmap_base;
- while (addr > len) {
- /* Go down by chunk size */
- addr = _ALIGN_DOWN(addr - len, 1ul << pshift);
-
- /* Check for hit with different page size */
- mask = slice_range_to_mask(addr, len);
- if (!slice_check_fit(mask, available)) {
- if (addr < SLICE_LOW_TOP)
- addr = _ALIGN_DOWN(addr, 1ul << SLICE_LOW_SHIFT);
- else if (addr < (1ul << SLICE_HIGH_SHIFT))
- addr = SLICE_LOW_TOP;
- else
- addr = _ALIGN_DOWN(addr, 1ul << SLICE_HIGH_SHIFT);
+ while (addr > PAGE_SIZE) {
+ info.high_limit = addr;
+ if (!slice_scan_available(addr - 1, available, 0, &addr))
continue;
- }
+ prev_slice:
/*
- * Lookup failure means no vma is above this address,
- * else if new region fits below vma->vm_start,
- * return with success:
+ * At this point [addr; info.high_limit) covers
+ * available slices only and starts at a slice boundary.
+ * Check if we need to reduce the range, or if we can
+ * extend it to cover the previous available slice.
*/
- vma = find_vma(mm, addr);
- if (!vma || (addr + len) <= vma->vm_start) {
- /* remember the address as a hint for next time */
- if (use_cache)
- mm->free_area_cache = addr;
- return addr;
+ if (addr < PAGE_SIZE)
+ addr = PAGE_SIZE;
+ else if (slice_scan_available(addr - 1, available, 0, &prev)) {
+ addr = prev;
+ goto prev_slice;
}
+ info.low_limit = addr;
- /* remember the largest hole we saw so far */
- if (use_cache && (addr + mm->cached_hole_size) < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
-
- /* try just below the current vma->vm_start */
- addr = vma->vm_start;
+ found = vm_unmapped_area(&info);
+ if (!(found & ~PAGE_MASK))
+ return found;
}
/*
@@ -373,28 +351,18 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm,
* can happen with large stack limits and large mmap()
* allocations.
*/
- addr = slice_find_area_bottomup(mm, len, available, psize, 0);
-
- /*
- * Restore the topdown base:
- */
- if (use_cache) {
- mm->free_area_cache = mm->mmap_base;
- mm->cached_hole_size = ~0UL;
- }
-
- return addr;
+ return slice_find_area_bottomup(mm, len, available, psize);
}
static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len,
struct slice_mask mask, int psize,
- int topdown, int use_cache)
+ int topdown)
{
if (topdown)
- return slice_find_area_topdown(mm, len, mask, psize, use_cache);
+ return slice_find_area_topdown(mm, len, mask, psize);
else
- return slice_find_area_bottomup(mm, len, mask, psize, use_cache);
+ return slice_find_area_bottomup(mm, len, mask, psize);
}
#define or_mask(dst, src) do { \
@@ -415,7 +383,7 @@ static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len,
unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
unsigned long flags, unsigned int psize,
- int topdown, int use_cache)
+ int topdown)
{
struct slice_mask mask = {0, 0};
struct slice_mask good_mask;
@@ -430,8 +398,8 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
BUG_ON(mm->task_size == 0);
slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize);
- slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d, use_cache=%d\n",
- addr, len, flags, topdown, use_cache);
+ slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d\n",
+ addr, len, flags, topdown);
if (len > mm->task_size)
return -ENOMEM;
@@ -440,7 +408,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
if (fixed && (addr & ((1ul << pshift) - 1)))
return -EINVAL;
if (fixed && addr > (mm->task_size - len))
- return -EINVAL;
+ return -ENOMEM;
/* If hint, make sure it matches our alignment restrictions */
if (!fixed && addr) {
@@ -503,8 +471,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
/* Now let's see if we can find something in the existing
* slices for that size
*/
- newaddr = slice_find_area(mm, len, good_mask, psize, topdown,
- use_cache);
+ newaddr = slice_find_area(mm, len, good_mask, psize, topdown);
if (newaddr != -ENOMEM) {
/* Found within the good mask, we don't have to setup,
* we thus return directly
@@ -536,8 +503,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
* anywhere in the good area.
*/
if (addr) {
- addr = slice_find_area(mm, len, good_mask, psize, topdown,
- use_cache);
+ addr = slice_find_area(mm, len, good_mask, psize, topdown);
if (addr != -ENOMEM) {
slice_dbg(" found area at 0x%lx\n", addr);
return addr;
@@ -547,15 +513,14 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
/* Now let's see if we can find something in the existing slices
* for that size plus free slices
*/
- addr = slice_find_area(mm, len, potential_mask, psize, topdown,
- use_cache);
+ addr = slice_find_area(mm, len, potential_mask, psize, topdown);
#ifdef CONFIG_PPC_64K_PAGES
if (addr == -ENOMEM && psize == MMU_PAGE_64K) {
/* retry the search with 4k-page slices included */
or_mask(potential_mask, compat_mask);
addr = slice_find_area(mm, len, potential_mask, psize,
- topdown, use_cache);
+ topdown);
}
#endif
@@ -586,8 +551,7 @@ unsigned long arch_get_unmapped_area(struct file *filp,
unsigned long flags)
{
return slice_get_unmapped_area(addr, len, flags,
- current->mm->context.user_psize,
- 0, 1);
+ current->mm->context.user_psize, 0);
}
unsigned long arch_get_unmapped_area_topdown(struct file *filp,
@@ -597,8 +561,7 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp,
const unsigned long flags)
{
return slice_get_unmapped_area(addr0, len, flags,
- current->mm->context.user_psize,
- 1, 1);
+ current->mm->context.user_psize, 1);
}
unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr)
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index 7c415ddde94..6c0b1f5f8d2 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -78,7 +78,7 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
arch_enter_lazy_mmu_mode();
for (; npages > 0; --npages) {
- pte_update(mm, addr, pte, 0, 0);
+ pte_update(mm, addr, pte, 0, 0, 0);
addr += PAGE_SIZE;
++pte;
}
@@ -105,7 +105,7 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len)
limit = spt->maxaddr;
for (; addr < limit; addr = next) {
next = pmd_addr_end(addr, limit);
- if (addr < 0x100000000) {
+ if (addr < 0x100000000UL) {
spm = spt->low_prot;
} else {
spm = spt->protptrs[addr >> SBP_L3_SHIFT];
@@ -130,6 +130,53 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len)
up_write(&mm->mmap_sem);
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct vm_area_struct *vma = walk->private;
+ split_huge_page_pmd(vma, addr, pmd);
+ return 0;
+}
+
+static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
+ unsigned long len)
+{
+ struct vm_area_struct *vma;
+ struct mm_walk subpage_proto_walk = {
+ .mm = mm,
+ .pmd_entry = subpage_walk_pmd_entry,
+ };
+
+ /*
+ * We don't try too hard, we just mark all the vma in that range
+ * VM_NOHUGEPAGE and split them.
+ */
+ vma = find_vma(mm, addr);
+ /*
+ * If the range is in unmapped range, just return
+ */
+ if (vma && ((addr + len) <= vma->vm_start))
+ return;
+
+ while (vma) {
+ if (vma->vm_start >= (addr + len))
+ break;
+ vma->vm_flags |= VM_NOHUGEPAGE;
+ subpage_proto_walk.private = vma;
+ walk_page_range(vma->vm_start, vma->vm_end,
+ &subpage_proto_walk);
+ vma = vma->vm_next;
+ }
+}
+#else
+static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
+ unsigned long len)
+{
+ return;
+}
+#endif
+
/*
* Copy in a subpage protection map for an address range.
* The map has 2 bits per 4k subpage, so 32 bits per 64k page.
@@ -168,10 +215,11 @@ long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
return -EFAULT;
down_write(&mm->mmap_sem);
+ subpage_mark_vma_nohuge(mm, addr, len);
for (limit = addr + len; addr < limit; addr = next) {
next = pmd_addr_end(addr, limit);
err = -ENOMEM;
- if (addr < 0x100000000) {
+ if (addr < 0x100000000UL) {
spm = spt->low_prot;
} else {
spm = spt->protptrs[addr >> SBP_L3_SHIFT];
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
index 0d82ef50dc3..c99f6510a0b 100644
--- a/arch/powerpc/mm/tlb_hash64.c
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -23,7 +23,6 @@
#include <linux/kernel.h>
#include <linux/mm.h>
-#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <asm/pgalloc.h>
@@ -82,11 +81,11 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
if (!is_kernel_addr(addr)) {
ssize = user_segment_size(addr);
vsid = get_vsid(mm->context.id, addr, ssize);
- WARN_ON(vsid == 0);
} else {
vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
ssize = mmu_kernel_ssize;
}
+ WARN_ON(vsid == 0);
vpn = hpt_vpn(addr, vsid, ssize);
rpte = __real_pte(__pte(pte), ptep);
@@ -183,12 +182,13 @@ void tlb_flush(struct mmu_gather *tlb)
* since 64K pages may overlap with other bridges when using 64K pages
* with 4K HW pages on IO space.
*
- * Because of that usage pattern, it's only available with CONFIG_HOTPLUG
- * and is implemented for small size rather than speed.
+ * Because of that usage pattern, it is implemented for small size rather
+ * than speed.
*/
void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
unsigned long end)
{
+ int hugepage_shift;
unsigned long flags;
start = _ALIGN_DOWN(start, PAGE_SIZE);
@@ -206,7 +206,8 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
local_irq_save(flags);
arch_enter_lazy_mmu_mode();
for (; start < end; start += PAGE_SIZE) {
- pte_t *ptep = find_linux_pte(mm->pgd, start);
+ pte_t *ptep = find_linux_pte_or_hugepte(mm->pgd, start,
+ &hugepage_shift);
unsigned long pte;
if (ptep == NULL)
@@ -214,7 +215,37 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
pte = pte_val(*ptep);
if (!(pte & _PAGE_HASHPTE))
continue;
- hpte_need_flush(mm, start, ptep, pte, 0);
+ if (unlikely(hugepage_shift && pmd_trans_huge(*(pmd_t *)pte)))
+ hpte_do_hugepage_flush(mm, start, (pmd_t *)pte);
+ else
+ hpte_need_flush(mm, start, ptep, pte, 0);
+ }
+ arch_leave_lazy_mmu_mode();
+ local_irq_restore(flags);
+}
+
+void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
+{
+ pte_t *pte;
+ pte_t *start_pte;
+ unsigned long flags;
+
+ addr = _ALIGN_DOWN(addr, PMD_SIZE);
+ /* Note: Normally, we should only ever use a batch within a
+ * PTE locked section. This violates the rule, but will work
+ * since we don't actually modify the PTEs, we just flush the
+ * hash while leaving the PTEs intact (including their reference
+ * to being hashed). This is not the most performance oriented
+ * way to do things but is fine for our needs here.
+ */
+ local_irq_save(flags);
+ arch_enter_lazy_mmu_mode();
+ start_pte = pte_offset_map(pmd, addr);
+ for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) {
+ unsigned long pteval = pte_val(*pte);
+ if (pteval & _PAGE_HASHPTE)
+ hpte_need_flush(mm, addr, pte, pteval, 0);
+ addr += PAGE_SIZE;
}
arch_leave_lazy_mmu_mode();
local_irq_restore(flags);
diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S
index b4113bf8635..356e8b41fb0 100644
--- a/arch/powerpc/mm/tlb_low_64e.S
+++ b/arch/powerpc/mm/tlb_low_64e.S
@@ -39,37 +39,49 @@
* *
**********************************************************************/
+/*
+ * Note that, unlike non-bolted handlers, TLB_EXFRAME is not
+ * modified by the TLB miss handlers themselves, since the TLB miss
+ * handler code will not itself cause a recursive TLB miss.
+ *
+ * TLB_EXFRAME will be modified when crit/mc/debug exceptions are
+ * entered/exited.
+ */
.macro tlb_prolog_bolted intnum addr
- mtspr SPRN_SPRG_GEN_SCRATCH,r13
+ mtspr SPRN_SPRG_GEN_SCRATCH,r12
+ mfspr r12,SPRN_SPRG_TLB_EXFRAME
+ std r13,EX_TLB_R13(r12)
+ std r10,EX_TLB_R10(r12)
mfspr r13,SPRN_SPRG_PACA
- std r10,PACA_EXTLB+EX_TLB_R10(r13)
+
mfcr r10
- std r11,PACA_EXTLB+EX_TLB_R11(r13)
+ std r11,EX_TLB_R11(r12)
#ifdef CONFIG_KVM_BOOKE_HV
BEGIN_FTR_SECTION
mfspr r11, SPRN_SRR1
END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
#endif
DO_KVM \intnum, SPRN_SRR1
- std r16,PACA_EXTLB+EX_TLB_R16(r13)
+ std r16,EX_TLB_R16(r12)
mfspr r16,\addr /* get faulting address */
- std r14,PACA_EXTLB+EX_TLB_R14(r13)
+ std r14,EX_TLB_R14(r12)
ld r14,PACAPGD(r13)
- std r15,PACA_EXTLB+EX_TLB_R15(r13)
- std r10,PACA_EXTLB+EX_TLB_CR(r13)
- TLB_MISS_PROLOG_STATS_BOLTED
+ std r15,EX_TLB_R15(r12)
+ std r10,EX_TLB_CR(r12)
+ TLB_MISS_PROLOG_STATS
.endm
.macro tlb_epilog_bolted
- ld r14,PACA_EXTLB+EX_TLB_CR(r13)
- ld r10,PACA_EXTLB+EX_TLB_R10(r13)
- ld r11,PACA_EXTLB+EX_TLB_R11(r13)
+ ld r14,EX_TLB_CR(r12)
+ ld r10,EX_TLB_R10(r12)
+ ld r11,EX_TLB_R11(r12)
+ ld r13,EX_TLB_R13(r12)
mtcr r14
- ld r14,PACA_EXTLB+EX_TLB_R14(r13)
- ld r15,PACA_EXTLB+EX_TLB_R15(r13)
- TLB_MISS_RESTORE_STATS_BOLTED
- ld r16,PACA_EXTLB+EX_TLB_R16(r13)
- mfspr r13,SPRN_SPRG_GEN_SCRATCH
+ ld r14,EX_TLB_R14(r12)
+ ld r15,EX_TLB_R15(r12)
+ TLB_MISS_RESTORE_STATS
+ ld r16,EX_TLB_R16(r12)
+ mfspr r12,SPRN_SPRG_GEN_SCRATCH
.endm
/* Data TLB miss */
@@ -136,7 +148,7 @@ BEGIN_MMU_FTR_SECTION
*/
PPC_TLBSRX_DOT(0,R16)
ldx r14,r14,r15 /* grab pgd entry */
- beq normal_tlb_miss_done /* tlb exists already, bail */
+ beq tlb_miss_done_bolted /* tlb exists already, bail */
MMU_FTR_SECTION_ELSE
ldx r14,r14,r15 /* grab pgd entry */
ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV)
@@ -192,6 +204,7 @@ ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV)
mtspr SPRN_MAS7_MAS3,r15
tlbwe
+tlb_miss_done_bolted:
TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK)
tlb_epilog_bolted
rfi
@@ -239,6 +252,183 @@ itlb_miss_fault_bolted:
beq tlb_miss_common_bolted
b itlb_miss_kernel_bolted
+#ifdef CONFIG_PPC_FSL_BOOK3E
+/*
+ * TLB miss handling for e6500 and derivatives, using hardware tablewalk.
+ *
+ * Linear mapping is bolted: no virtual page table or nested TLB misses
+ * Indirect entries in TLB1, hardware loads resulting direct entries
+ * into TLB0
+ * No HES or NV hint on TLB1, so we need to do software round-robin
+ * No tlbsrx. so we need a spinlock, and we have to deal
+ * with MAS-damage caused by tlbsx
+ * 4K pages only
+ */
+
+ START_EXCEPTION(instruction_tlb_miss_e6500)
+ tlb_prolog_bolted BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR0
+
+ ld r11,PACA_TCD_PTR(r13)
+ srdi. r15,r16,60 /* get region */
+ ori r16,r16,1
+
+ TLB_MISS_STATS_SAVE_INFO_BOLTED
+ bne tlb_miss_kernel_e6500 /* user/kernel test */
+
+ b tlb_miss_common_e6500
+
+ START_EXCEPTION(data_tlb_miss_e6500)
+ tlb_prolog_bolted BOOKE_INTERRUPT_DTLB_MISS SPRN_DEAR
+
+ ld r11,PACA_TCD_PTR(r13)
+ srdi. r15,r16,60 /* get region */
+ rldicr r16,r16,0,62
+
+ TLB_MISS_STATS_SAVE_INFO_BOLTED
+ bne tlb_miss_kernel_e6500 /* user vs kernel check */
+
+/*
+ * This is the guts of the TLB miss handler for e6500 and derivatives.
+ * We are entered with:
+ *
+ * r16 = page of faulting address (low bit 0 if data, 1 if instruction)
+ * r15 = crap (free to use)
+ * r14 = page table base
+ * r13 = PACA
+ * r11 = tlb_per_core ptr
+ * r10 = cpu number
+ */
+tlb_miss_common_e6500:
+ /*
+ * Search if we already have an indirect entry for that virtual
+ * address, and if we do, bail out.
+ *
+ * MAS6:IND should be already set based on MAS4
+ */
+1: lbarx r15,0,r11
+ lhz r10,PACAPACAINDEX(r13)
+ cmpdi r15,0
+ cmpdi cr1,r15,1 /* set cr1.eq = 0 for non-recursive */
+ bne 2f
+ stbcx. r10,0,r11
+ bne 1b
+3:
+ .subsection 1
+2: cmpd cr1,r15,r10 /* recursive lock due to mcheck/crit/etc? */
+ beq cr1,3b /* unlock will happen if cr1.eq = 0 */
+ lbz r15,0(r11)
+ cmpdi r15,0
+ bne 2b
+ b 1b
+ .previous
+
+ mfspr r15,SPRN_MAS2
+
+ tlbsx 0,r16
+ mfspr r10,SPRN_MAS1
+ andis. r10,r10,MAS1_VALID@h
+ bne tlb_miss_done_e6500
+
+ /* Undo MAS-damage from the tlbsx */
+ mfspr r10,SPRN_MAS1
+ oris r10,r10,MAS1_VALID@h
+ mtspr SPRN_MAS1,r10
+ mtspr SPRN_MAS2,r15
+
+ /* Now, we need to walk the page tables. First check if we are in
+ * range.
+ */
+ rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4
+ bne- tlb_miss_fault_e6500
+
+ rldicl r15,r16,64-PGDIR_SHIFT+3,64-PGD_INDEX_SIZE-3
+ cmpldi cr0,r14,0
+ clrrdi r15,r15,3
+ beq- tlb_miss_fault_e6500 /* No PGDIR, bail */
+ ldx r14,r14,r15 /* grab pgd entry */
+
+ rldicl r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3
+ clrrdi r15,r15,3
+ cmpdi cr0,r14,0
+ bge tlb_miss_fault_e6500 /* Bad pgd entry or hugepage; bail */
+ ldx r14,r14,r15 /* grab pud entry */
+
+ rldicl r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3
+ clrrdi r15,r15,3
+ cmpdi cr0,r14,0
+ bge tlb_miss_fault_e6500
+ ldx r14,r14,r15 /* Grab pmd entry */
+
+ mfspr r10,SPRN_MAS0
+ cmpdi cr0,r14,0
+ bge tlb_miss_fault_e6500
+
+ /* Now we build the MAS for a 2M indirect page:
+ *
+ * MAS 0 : ESEL needs to be filled by software round-robin
+ * MAS 1 : Fully set up
+ * - PID already updated by caller if necessary
+ * - TSIZE for now is base ind page size always
+ * - TID already cleared if necessary
+ * MAS 2 : Default not 2M-aligned, need to be redone
+ * MAS 3+7 : Needs to be done
+ */
+
+ ori r14,r14,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT)
+ mtspr SPRN_MAS7_MAS3,r14
+
+ clrrdi r15,r16,21 /* make EA 2M-aligned */
+ mtspr SPRN_MAS2,r15
+
+ lbz r15,TCD_ESEL_NEXT(r11)
+ lbz r16,TCD_ESEL_MAX(r11)
+ lbz r14,TCD_ESEL_FIRST(r11)
+ rlwimi r10,r15,16,0x00ff0000 /* insert esel_next into MAS0 */
+ addi r15,r15,1 /* increment esel_next */
+ mtspr SPRN_MAS0,r10
+ cmpw r15,r16
+ iseleq r15,r14,r15 /* if next == last use first */
+ stb r15,TCD_ESEL_NEXT(r11)
+
+ tlbwe
+
+tlb_miss_done_e6500:
+ .macro tlb_unlock_e6500
+ beq cr1,1f /* no unlock if lock was recursively grabbed */
+ li r15,0
+ isync
+ stb r15,0(r11)
+1:
+ .endm
+
+ tlb_unlock_e6500
+ TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK)
+ tlb_epilog_bolted
+ rfi
+
+tlb_miss_kernel_e6500:
+ mfspr r10,SPRN_MAS1
+ ld r14,PACA_KERNELPGD(r13)
+ cmpldi cr0,r15,8 /* Check for vmalloc region */
+ rlwinm r10,r10,0,16,1 /* Clear TID */
+ mtspr SPRN_MAS1,r10
+ beq+ tlb_miss_common_e6500
+
+tlb_miss_fault_e6500:
+ tlb_unlock_e6500
+ /* We need to check if it was an instruction miss */
+ andi. r16,r16,1
+ bne itlb_miss_fault_e6500
+dtlb_miss_fault_e6500:
+ TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT)
+ tlb_epilog_bolted
+ b exc_data_storage_book3e
+itlb_miss_fault_e6500:
+ TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT)
+ tlb_epilog_bolted
+ b exc_instruction_storage_book3e
+#endif /* CONFIG_PPC_FSL_BOOK3E */
+
/**********************************************************************
* *
* TLB miss handling for Book3E with TLB reservation and HES support *
@@ -918,7 +1108,8 @@ tlb_load_linear:
ld r11,PACATOC(r13)
ld r11,linear_map_top@got(r11)
ld r10,0(r11)
- cmpld cr0,r10,r16
+ tovirt(10,10)
+ cmpld cr0,r16,r10
bge tlb_load_linear_fault
/* MAS1 need whole new setup. */
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index df32a838dcf..92cb18d52ea 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -43,6 +43,7 @@
#include <asm/tlb.h>
#include <asm/code-patching.h>
#include <asm/hugetlb.h>
+#include <asm/paca.h>
#include "mmu_decl.h"
@@ -58,6 +59,10 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
.shift = 12,
.enc = BOOK3E_PAGESZ_4K,
},
+ [MMU_PAGE_2M] = {
+ .shift = 21,
+ .enc = BOOK3E_PAGESZ_2M,
+ },
[MMU_PAGE_4M] = {
.shift = 22,
.enc = BOOK3E_PAGESZ_4M,
@@ -136,9 +141,18 @@ static inline int mmu_get_tsize(int psize)
int mmu_linear_psize; /* Page size used for the linear mapping */
int mmu_pte_psize; /* Page size used for PTE pages */
int mmu_vmemmap_psize; /* Page size used for the virtual mem map */
-int book3e_htw_enabled; /* Is HW tablewalk enabled ? */
+int book3e_htw_mode; /* HW tablewalk? Value is PPC_HTW_* */
unsigned long linear_map_top; /* Top of linear mapping */
+
+/*
+ * Number of bytes to add to SPRN_SPRG_TLB_EXFRAME on crit/mcheck/debug
+ * exceptions. This is used for bolted and e6500 TLB miss handlers which
+ * do not modify this SPRG in the TLB miss code; for other TLB miss handlers,
+ * this is set to zero.
+ */
+int extlb_level_exc;
+
#endif /* CONFIG_PPC64 */
#ifdef CONFIG_PPC_FSL_BOOK3E
@@ -305,7 +319,7 @@ void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
{
#ifdef CONFIG_HUGETLB_PAGE
- if (is_vm_hugetlb_page(vma))
+ if (vma && is_vm_hugetlb_page(vma))
flush_hugetlb_page(vma, vmaddr);
#endif
@@ -377,7 +391,7 @@ void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address)
{
int tsize = mmu_psize_defs[mmu_pte_psize].enc;
- if (book3e_htw_enabled) {
+ if (book3e_htw_mode != PPC_HTW_NONE) {
unsigned long start = address & PMD_MASK;
unsigned long end = address + PMD_SIZE;
unsigned long size = 1UL << mmu_psize_defs[mmu_pte_psize].shift;
@@ -414,9 +428,9 @@ static void setup_page_sizes(void)
#ifdef CONFIG_PPC_FSL_BOOK3E
unsigned int mmucfg = mfspr(SPRN_MMUCFG);
+ int fsl_mmu = mmu_has_feature(MMU_FTR_TYPE_FSL_E);
- if (((mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V1) &&
- (mmu_has_feature(MMU_FTR_TYPE_FSL_E))) {
+ if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V1) {
unsigned int tlb1cfg = mfspr(SPRN_TLB1CFG);
unsigned int min_pg, max_pg;
@@ -430,7 +444,7 @@ static void setup_page_sizes(void)
def = &mmu_psize_defs[psize];
shift = def->shift;
- if (shift == 0)
+ if (shift == 0 || shift & 1)
continue;
/* adjust to be in terms of 4^shift Kb */
@@ -440,7 +454,40 @@ static void setup_page_sizes(void)
def->flags |= MMU_PAGE_SIZE_DIRECT;
}
- goto no_indirect;
+ goto out;
+ }
+
+ if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2) {
+ u32 tlb1cfg, tlb1ps;
+
+ tlb0cfg = mfspr(SPRN_TLB0CFG);
+ tlb1cfg = mfspr(SPRN_TLB1CFG);
+ tlb1ps = mfspr(SPRN_TLB1PS);
+ eptcfg = mfspr(SPRN_EPTCFG);
+
+ if ((tlb1cfg & TLBnCFG_IND) && (tlb0cfg & TLBnCFG_PT))
+ book3e_htw_mode = PPC_HTW_E6500;
+
+ /*
+ * We expect 4K subpage size and unrestricted indirect size.
+ * The lack of a restriction on indirect size is a Freescale
+ * extension, indicated by PSn = 0 but SPSn != 0.
+ */
+ if (eptcfg != 2)
+ book3e_htw_mode = PPC_HTW_NONE;
+
+ for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+ struct mmu_psize_def *def = &mmu_psize_defs[psize];
+
+ if (tlb1ps & (1U << (def->shift - 10))) {
+ def->flags |= MMU_PAGE_SIZE_DIRECT;
+
+ if (book3e_htw_mode && psize == MMU_PAGE_2M)
+ def->flags |= MMU_PAGE_SIZE_INDIRECT;
+ }
+ }
+
+ goto out;
}
#endif
@@ -457,8 +504,11 @@ static void setup_page_sizes(void)
}
/* Indirect page sizes supported ? */
- if ((tlb0cfg & TLBnCFG_IND) == 0)
- goto no_indirect;
+ if ((tlb0cfg & TLBnCFG_IND) == 0 ||
+ (tlb0cfg & TLBnCFG_PT) == 0)
+ goto out;
+
+ book3e_htw_mode = PPC_HTW_IBM;
/* Now, we only deal with one IND page size for each
* direct size. Hopefully all implementations today are
@@ -483,8 +533,8 @@ static void setup_page_sizes(void)
def->ind = ps + 10;
}
}
- no_indirect:
+out:
/* Cleanup array and print summary */
pr_info("MMU: Supported page sizes\n");
for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
@@ -504,44 +554,28 @@ static void setup_page_sizes(void)
}
}
-static void __patch_exception(int exc, unsigned long addr)
-{
- extern unsigned int interrupt_base_book3e;
- unsigned int *ibase = &interrupt_base_book3e;
-
- /* Our exceptions vectors start with a NOP and -then- a branch
- * to deal with single stepping from userspace which stops on
- * the second instruction. Thus we need to patch the second
- * instruction of the exception, not the first one
- */
-
- patch_branch(ibase + (exc / 4) + 1, addr, 0);
-}
-
-#define patch_exception(exc, name) do { \
- extern unsigned int name; \
- __patch_exception((exc), (unsigned long)&name); \
-} while (0)
-
static void setup_mmu_htw(void)
{
- /* Check if HW tablewalk is present, and if yes, enable it by:
- *
- * - patching the TLB miss handlers to branch to the
- * one dedicates to it
- *
- * - setting the global book3e_htw_enabled
- */
- unsigned int tlb0cfg = mfspr(SPRN_TLB0CFG);
+ /*
+ * If we want to use HW tablewalk, enable it by patching the TLB miss
+ * handlers to branch to the one dedicated to it.
+ */
- if ((tlb0cfg & TLBnCFG_IND) &&
- (tlb0cfg & TLBnCFG_PT)) {
+ switch (book3e_htw_mode) {
+ case PPC_HTW_IBM:
patch_exception(0x1c0, exc_data_tlb_miss_htw_book3e);
patch_exception(0x1e0, exc_instruction_tlb_miss_htw_book3e);
- book3e_htw_enabled = 1;
+ break;
+#ifdef CONFIG_PPC_FSL_BOOK3E
+ case PPC_HTW_E6500:
+ extlb_level_exc = EX_TLB_SIZE;
+ patch_exception(0x1c0, exc_data_tlb_miss_e6500_book3e);
+ patch_exception(0x1e0, exc_instruction_tlb_miss_e6500_book3e);
+ break;
+#endif
}
pr_info("MMU: Book3E HW tablewalk %s\n",
- book3e_htw_enabled ? "enabled" : "not supported");
+ book3e_htw_mode != PPC_HTW_NONE ? "enabled" : "not supported");
}
/*
@@ -562,8 +596,13 @@ static void __early_init_mmu(int boot_cpu)
/* XXX This should be decided at runtime based on supported
* page sizes in the TLB, but for now let's assume 16M is
* always there and a good fit (which it probably is)
+ *
+ * Freescale booke only supports 4K pages in TLB0, so use that.
*/
- mmu_vmemmap_psize = MMU_PAGE_16M;
+ if (mmu_has_feature(MMU_FTR_TYPE_FSL_E))
+ mmu_vmemmap_psize = MMU_PAGE_4K;
+ else
+ mmu_vmemmap_psize = MMU_PAGE_16M;
/* XXX This code only checks for TLB 0 capabilities and doesn't
* check what page size combos are supported by the HW. It
@@ -581,8 +620,16 @@ static void __early_init_mmu(int boot_cpu)
/* Set MAS4 based on page table setting */
mas4 = 0x4 << MAS4_WIMGED_SHIFT;
- if (book3e_htw_enabled) {
- mas4 |= mas4 | MAS4_INDD;
+ switch (book3e_htw_mode) {
+ case PPC_HTW_E6500:
+ mas4 |= MAS4_INDD;
+ mas4 |= BOOK3E_PAGESZ_2M << MAS4_TSIZED_SHIFT;
+ mas4 |= MAS4_TLBSELD(1);
+ mmu_pte_psize = MMU_PAGE_2M;
+ break;
+
+ case PPC_HTW_IBM:
+ mas4 |= MAS4_INDD;
#ifdef CONFIG_PPC_64K_PAGES
mas4 |= BOOK3E_PAGESZ_256M << MAS4_TSIZED_SHIFT;
mmu_pte_psize = MMU_PAGE_256M;
@@ -590,13 +637,16 @@ static void __early_init_mmu(int boot_cpu)
mas4 |= BOOK3E_PAGESZ_1M << MAS4_TSIZED_SHIFT;
mmu_pte_psize = MMU_PAGE_1M;
#endif
- } else {
+ break;
+
+ case PPC_HTW_NONE:
#ifdef CONFIG_PPC_64K_PAGES
mas4 |= BOOK3E_PAGESZ_64K << MAS4_TSIZED_SHIFT;
#else
mas4 |= BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT;
#endif
mmu_pte_psize = mmu_virtual_psize;
+ break;
}
mtspr(SPRN_MAS4, mas4);
@@ -616,8 +666,12 @@ static void __early_init_mmu(int boot_cpu)
/* limit memory so we dont have linear faults */
memblock_enforce_memory_limit(linear_map_top);
- patch_exception(0x1c0, exc_data_tlb_miss_bolted_book3e);
- patch_exception(0x1e0, exc_instruction_tlb_miss_bolted_book3e);
+ if (book3e_htw_mode == PPC_HTW_NONE) {
+ extlb_level_exc = EX_TLB_SIZE;
+ patch_exception(0x1c0, exc_data_tlb_miss_bolted_book3e);
+ patch_exception(0x1e0,
+ exc_instruction_tlb_miss_bolted_book3e);
+ }
}
#endif
@@ -634,7 +688,7 @@ void __init early_init_mmu(void)
__early_init_mmu(1);
}
-void __cpuinit early_init_mmu_secondary(void)
+void early_init_mmu_secondary(void)
{
__early_init_mmu(0);
}
diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S
index 626ad081639..43ff3c797fb 100644
--- a/arch/powerpc/mm/tlb_nohash_low.S
+++ b/arch/powerpc/mm/tlb_nohash_low.S
@@ -402,7 +402,9 @@ _GLOBAL(set_context)
* Load TLBCAM[index] entry in to the L2 CAM MMU
*/
_GLOBAL(loadcam_entry)
- LOAD_REG_ADDR(r4, TLBCAM)
+ mflr r5
+ LOAD_REG_ADDR_PIC(r4, TLBCAM)
+ mtlr r5
mulli r5,r3,TLBCAM_SIZE
add r3,r5,r4
lwz r4,TLBCAM_MAS0(r3)