diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2009-09-15 09:51:09 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-09-15 09:51:09 -0700 |
commit | 723e9db7a46e328527cc3da2b478b831184fe828 (patch) | |
tree | cdeda255633057dcb4c84097bed27b2bbf76970f /arch/powerpc/mm | |
parent | ada3fa15057205b7d3f727bba5cd26b5912e350f (diff) | |
parent | d331d8305cba713605854aab63a000fb892353a7 (diff) |
Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc
* 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc: (134 commits)
powerpc/nvram: Enable use Generic NVRAM driver for different size chips
powerpc/iseries: Fix oops reading from /proc/iSeries/mf/*/cmdline
powerpc/ps3: Workaround for flash memory I/O error
powerpc/booke: Don't set DABR on 64-bit BookE, use DAC1 instead
powerpc/perf_counters: Reduce stack usage of power_check_constraints
powerpc: Fix bug where perf_counters breaks oprofile
powerpc/85xx: Fix SMP compile error and allow NULL for smp_ops
powerpc/irq: Improve nanodoc
powerpc: Fix some late PowerMac G5 with PCIe ATI graphics
powerpc/fsl-booke: Use HW PTE format if CONFIG_PTE_64BIT
powerpc/book3e: Add missing page sizes
powerpc/pseries: Fix to handle slb resize across migration
powerpc/powermac: Thermal control turns system off too eagerly
powerpc/pci: Merge ppc32 and ppc64 versions of phb_scan()
powerpc/405ex: support cuImage via included dtb
powerpc/405ex: provide necessary fixup function to support cuImage
powerpc/40x: Add support for the ESTeem 195E (PPC405EP) SBC
powerpc/44x: Add Eiger AMCC (AppliedMicro) PPC460SX evaluation board support.
powerpc/44x: Update Arches defconfig
powerpc/44x: Update Arches dts
...
Fix up conflicts in drivers/char/agp/uninorth-agp.c
Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r-- | arch/powerpc/mm/40x_mmu.c | 4 | ||||
-rw-r--r-- | arch/powerpc/mm/Makefile | 1 | ||||
-rw-r--r-- | arch/powerpc/mm/fsl_booke_mmu.c | 2 | ||||
-rw-r--r-- | arch/powerpc/mm/hash_low_32.S | 4 | ||||
-rw-r--r-- | arch/powerpc/mm/hugetlbpage.c | 8 | ||||
-rw-r--r-- | arch/powerpc/mm/init_32.c | 2 | ||||
-rw-r--r-- | arch/powerpc/mm/init_64.c | 55 | ||||
-rw-r--r-- | arch/powerpc/mm/mmu_context_nohash.c | 96 | ||||
-rw-r--r-- | arch/powerpc/mm/mmu_decl.h | 37 | ||||
-rw-r--r-- | arch/powerpc/mm/pgtable.c | 179 | ||||
-rw-r--r-- | arch/powerpc/mm/pgtable_32.c | 2 | ||||
-rw-r--r-- | arch/powerpc/mm/pgtable_64.c | 59 | ||||
-rw-r--r-- | arch/powerpc/mm/slb.c | 46 | ||||
-rw-r--r-- | arch/powerpc/mm/tlb_hash32.c | 3 | ||||
-rw-r--r-- | arch/powerpc/mm/tlb_hash64.c | 20 | ||||
-rw-r--r-- | arch/powerpc/mm/tlb_low_64e.S | 770 | ||||
-rw-r--r-- | arch/powerpc/mm/tlb_nohash.c | 268 | ||||
-rw-r--r-- | arch/powerpc/mm/tlb_nohash_low.S | 87 |
18 files changed, 1486 insertions, 157 deletions
diff --git a/arch/powerpc/mm/40x_mmu.c b/arch/powerpc/mm/40x_mmu.c index 29954dc2894..f5e7b9ce63d 100644 --- a/arch/powerpc/mm/40x_mmu.c +++ b/arch/powerpc/mm/40x_mmu.c @@ -105,7 +105,7 @@ unsigned long __init mmu_mapin_ram(void) while (s >= LARGE_PAGE_SIZE_16M) { pmd_t *pmdp; - unsigned long val = p | _PMD_SIZE_16M | _PAGE_HWEXEC | _PAGE_HWWRITE; + unsigned long val = p | _PMD_SIZE_16M | _PAGE_EXEC | _PAGE_HWWRITE; pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v); pmd_val(*pmdp++) = val; @@ -120,7 +120,7 @@ unsigned long __init mmu_mapin_ram(void) while (s >= LARGE_PAGE_SIZE_4M) { pmd_t *pmdp; - unsigned long val = p | _PMD_SIZE_4M | _PAGE_HWEXEC | _PAGE_HWWRITE; + unsigned long val = p | _PMD_SIZE_4M | _PAGE_EXEC | _PAGE_HWWRITE; pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v); pmd_val(*pmdp) = val; diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 3e68363405b..6fb8fc8d2fe 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -13,6 +13,7 @@ obj-y := fault.o mem.o pgtable.o gup.o \ pgtable_$(CONFIG_WORD_SIZE).o obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ tlb_nohash_low.o +obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(CONFIG_WORD_SIZE)e.o obj-$(CONFIG_PPC64) += mmap_64.o hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o obj-$(CONFIG_PPC_STD_MMU_64) += hash_utils_64.o \ diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c index bb3d65998e6..dc93e95b256 100644 --- a/arch/powerpc/mm/fsl_booke_mmu.c +++ b/arch/powerpc/mm/fsl_booke_mmu.c @@ -161,7 +161,7 @@ unsigned long __init mmu_mapin_ram(void) unsigned long virt = PAGE_OFFSET; phys_addr_t phys = memstart_addr; - while (cam[tlbcam_index] && tlbcam_index < ARRAY_SIZE(cam)) { + while (tlbcam_index < ARRAY_SIZE(cam) && cam[tlbcam_index]) { settlbcam(tlbcam_index, virt, phys, cam[tlbcam_index], PAGE_KERNEL_X, 0); virt += cam[tlbcam_index]; phys += cam[tlbcam_index]; diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S index 14af8cedab7..b13d58932bf 100644 --- a/arch/powerpc/mm/hash_low_32.S +++ b/arch/powerpc/mm/hash_low_32.S @@ -40,7 +40,7 @@ mmu_hash_lock: * The address is in r4, and r3 contains an access flag: * _PAGE_RW (0x400) if a write. * r9 contains the SRR1 value, from which we use the MSR_PR bit. - * SPRG3 contains the physical address of the current task's thread. + * SPRG_THREAD contains the physical address of the current task's thread. * * Returns to the caller if the access is illegal or there is no * mapping for the address. Otherwise it places an appropriate PTE @@ -68,7 +68,7 @@ _GLOBAL(hash_page) /* Get PTE (linux-style) and check access */ lis r0,KERNELBASE@h /* check if kernel address */ cmplw 0,r4,r0 - mfspr r8,SPRN_SPRG3 /* current task's THREAD (phys) */ + mfspr r8,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ ori r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */ lwz r5,PGDIR(r8) /* virt page-table root */ blt+ 112f /* assume user more likely */ diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index c46ef2ffa3d..90df6ffe3a4 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -57,8 +57,10 @@ unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */ #define HUGEPTE_CACHE_NAME(psize) (huge_pgtable_cache_name[psize]) static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = { - "unused_4K", "hugepte_cache_64K", "unused_64K_AP", - "hugepte_cache_1M", "hugepte_cache_16M", "hugepte_cache_16G" + [MMU_PAGE_64K] = "hugepte_cache_64K", + [MMU_PAGE_1M] = "hugepte_cache_1M", + [MMU_PAGE_16M] = "hugepte_cache_16M", + [MMU_PAGE_16G] = "hugepte_cache_16G", }; /* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad() @@ -700,6 +702,8 @@ static void __init set_huge_psize(int psize) if (mmu_huge_psizes[psize] || mmu_psize_defs[psize].shift == PAGE_SHIFT) return; + if (WARN_ON(HUGEPTE_CACHE_NAME(psize) == NULL)) + return; hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT); switch (mmu_psize_defs[psize].shift) { diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 3de6a0d9382..3ef5084b90c 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -54,8 +54,6 @@ #endif #define MAX_LOW_MEM CONFIG_LOWMEM_SIZE -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); - phys_addr_t total_memory; phys_addr_t total_lowmem; diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 68a821add28..31582329cd6 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -205,6 +205,47 @@ static int __meminit vmemmap_populated(unsigned long start, int page_size) return 0; } +/* On hash-based CPUs, the vmemmap is bolted in the hash table. + * + * On Book3E CPUs, the vmemmap is currently mapped in the top half of + * the vmalloc space using normal page tables, though the size of + * pages encoded in the PTEs can be different + */ + +#ifdef CONFIG_PPC_BOOK3E +static void __meminit vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + /* Create a PTE encoding without page size */ + unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED | + _PAGE_KERNEL_RW; + + /* PTEs only contain page size encodings up to 32M */ + BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf); + + /* Encode the size in the PTE */ + flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8; + + /* For each PTE for that area, map things. Note that we don't + * increment phys because all PTEs are of the large size and + * thus must have the low bits clear + */ + for (i = 0; i < page_size; i += PAGE_SIZE) + BUG_ON(map_kernel_page(start + i, phys, flags)); +} +#else /* CONFIG_PPC_BOOK3E */ +static void __meminit vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + int mapped = htab_bolt_mapping(start, start + page_size, phys, + PAGE_KERNEL, mmu_vmemmap_psize, + mmu_kernel_ssize); + BUG_ON(mapped < 0); +} +#endif /* CONFIG_PPC_BOOK3E */ + int __meminit vmemmap_populate(struct page *start_page, unsigned long nr_pages, int node) { @@ -215,8 +256,11 @@ int __meminit vmemmap_populate(struct page *start_page, /* Align to the page size of the linear mapping. */ start = _ALIGN_DOWN(start, page_size); + pr_debug("vmemmap_populate page %p, %ld pages, node %d\n", + start_page, nr_pages, node); + pr_debug(" -> map %lx..%lx\n", start, end); + for (; start < end; start += page_size) { - int mapped; void *p; if (vmemmap_populated(start, page_size)) @@ -226,13 +270,10 @@ int __meminit vmemmap_populate(struct page *start_page, if (!p) return -ENOMEM; - pr_debug("vmemmap %08lx allocated at %p, physical %08lx.\n", - start, p, __pa(p)); + pr_debug(" * %016lx..%016lx allocated at %p\n", + start, start + page_size, p); - mapped = htab_bolt_mapping(start, start + page_size, __pa(p), - pgprot_val(PAGE_KERNEL), - mmu_vmemmap_psize, mmu_kernel_ssize); - BUG_ON(mapped < 0); + vmemmap_create_mapping(start, page_size, __pa(p)); } return 0; diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c index b1a727def15..c2f93dc470e 100644 --- a/arch/powerpc/mm/mmu_context_nohash.c +++ b/arch/powerpc/mm/mmu_context_nohash.c @@ -25,10 +25,20 @@ * also clear mm->cpu_vm_mask bits when processes are migrated */ -#undef DEBUG -#define DEBUG_STEAL_ONLY -#undef DEBUG_MAP_CONSISTENCY -/*#define DEBUG_CLAMP_LAST_CONTEXT 15 */ +#define DEBUG_MAP_CONSISTENCY +#define DEBUG_CLAMP_LAST_CONTEXT 31 +//#define DEBUG_HARDER + +/* We don't use DEBUG because it tends to be compiled in always nowadays + * and this would generate way too much output + */ +#ifdef DEBUG_HARDER +#define pr_hard(args...) printk(KERN_DEBUG args) +#define pr_hardcont(args...) printk(KERN_CONT args) +#else +#define pr_hard(args...) do { } while(0) +#define pr_hardcont(args...) do { } while(0) +#endif #include <linux/kernel.h> #include <linux/mm.h> @@ -71,7 +81,7 @@ static DEFINE_SPINLOCK(context_lock); static unsigned int steal_context_smp(unsigned int id) { struct mm_struct *mm; - unsigned int cpu, max; + unsigned int cpu, max, i; max = last_context - first_context; @@ -89,15 +99,22 @@ static unsigned int steal_context_smp(unsigned int id) id = first_context; continue; } - pr_devel("[%d] steal context %d from mm @%p\n", - smp_processor_id(), id, mm); + pr_hardcont(" | steal %d from 0x%p", id, mm); /* Mark this mm has having no context anymore */ mm->context.id = MMU_NO_CONTEXT; - /* Mark it stale on all CPUs that used this mm */ - for_each_cpu(cpu, mm_cpumask(mm)) - __set_bit(id, stale_map[cpu]); + /* Mark it stale on all CPUs that used this mm. For threaded + * implementations, we set it on all threads on each core + * represented in the mask. A future implementation will use + * a core map instead but this will do for now. + */ + for_each_cpu(cpu, mm_cpumask(mm)) { + for (i = cpu_first_thread_in_core(cpu); + i <= cpu_last_thread_in_core(cpu); i++) + __set_bit(id, stale_map[i]); + cpu = i - 1; + } return id; } @@ -126,7 +143,7 @@ static unsigned int steal_context_up(unsigned int id) /* Pick up the victim mm */ mm = context_mm[id]; - pr_devel("[%d] steal context %d from mm @%p\n", cpu, id, mm); + pr_hardcont(" | steal %d from 0x%p", id, mm); /* Flush the TLB for that context */ local_flush_tlb_mm(mm); @@ -173,25 +190,20 @@ static void context_check_map(void) { } void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) { - unsigned int id, cpu = smp_processor_id(); + unsigned int i, id, cpu = smp_processor_id(); unsigned long *map; /* No lockless fast path .. yet */ spin_lock(&context_lock); -#ifndef DEBUG_STEAL_ONLY - pr_devel("[%d] activating context for mm @%p, active=%d, id=%d\n", - cpu, next, next->context.active, next->context.id); -#endif + pr_hard("[%d] activating context for mm @%p, active=%d, id=%d", + cpu, next, next->context.active, next->context.id); #ifdef CONFIG_SMP /* Mark us active and the previous one not anymore */ next->context.active++; if (prev) { -#ifndef DEBUG_STEAL_ONLY - pr_devel(" old context %p active was: %d\n", - prev, prev->context.active); -#endif + pr_hardcont(" (old=0x%p a=%d)", prev, prev->context.active); WARN_ON(prev->context.active < 1); prev->context.active--; } @@ -201,8 +213,14 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) /* If we already have a valid assigned context, skip all that */ id = next->context.id; - if (likely(id != MMU_NO_CONTEXT)) + if (likely(id != MMU_NO_CONTEXT)) { +#ifdef DEBUG_MAP_CONSISTENCY + if (context_mm[id] != next) + pr_err("MMU: mm 0x%p has id %d but context_mm[%d] says 0x%p\n", + next, id, id, context_mm[id]); +#endif goto ctxt_ok; + } /* We really don't have a context, let's try to acquire one */ id = next_context; @@ -235,11 +253,7 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) next_context = id + 1; context_mm[id] = next; next->context.id = id; - -#ifndef DEBUG_STEAL_ONLY - pr_devel("[%d] picked up new id %d, nrf is now %d\n", - cpu, id, nr_free_contexts); -#endif + pr_hardcont(" | new id=%d,nrf=%d", id, nr_free_contexts); context_check_map(); ctxt_ok: @@ -248,15 +262,21 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) * local TLB for it and unmark it before we use it */ if (test_bit(id, stale_map[cpu])) { - pr_devel("[%d] flushing stale context %d for mm @%p !\n", - cpu, id, next); + pr_hardcont(" | stale flush %d [%d..%d]", + id, cpu_first_thread_in_core(cpu), + cpu_last_thread_in_core(cpu)); + local_flush_tlb_mm(next); /* XXX This clear should ultimately be part of local_flush_tlb_mm */ - __clear_bit(id, stale_map[cpu]); + for (i = cpu_first_thread_in_core(cpu); + i <= cpu_last_thread_in_core(cpu); i++) { + __clear_bit(id, stale_map[i]); + } } /* Flick the MMU and release lock */ + pr_hardcont(" -> %d\n", id); set_context(id, next->pgd); spin_unlock(&context_lock); } @@ -266,6 +286,8 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) */ int init_new_context(struct task_struct *t, struct mm_struct *mm) { + pr_hard("initing context for mm @%p\n", mm); + mm->context.id = MMU_NO_CONTEXT; mm->context.active = 0; @@ -305,7 +327,9 @@ static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned int)(long)hcpu; - +#ifdef CONFIG_HOTPLUG_CPU + struct task_struct *p; +#endif /* We don't touch CPU 0 map, it's allocated at aboot and kept * around forever */ @@ -324,8 +348,16 @@ static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self, pr_devel("MMU: Freeing stale context map for CPU %d\n", cpu); kfree(stale_map[cpu]); stale_map[cpu] = NULL; - break; -#endif + + /* We also clear the cpu_vm_mask bits of CPUs going away */ + read_lock(&tasklist_lock); + for_each_process(p) { + if (p->mm) + cpu_mask_clear_cpu(cpu, mm_cpumask(p->mm)); + } + read_unlock(&tasklist_lock); + break; +#endif /* CONFIG_HOTPLUG_CPU */ } return NOTIFY_OK; } diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index d1f9c62dc17..d2e5321d5ea 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -36,21 +36,37 @@ static inline void _tlbil_pid(unsigned int pid) { asm volatile ("sync; tlbia; isync" : : : "memory"); } +#define _tlbil_pid_noind(pid) _tlbil_pid(pid) + #else /* CONFIG_40x || CONFIG_8xx */ extern void _tlbil_all(void); extern void _tlbil_pid(unsigned int pid); +#ifdef CONFIG_PPC_BOOK3E +extern void _tlbil_pid_noind(unsigned int pid); +#else +#define _tlbil_pid_noind(pid) _tlbil_pid(pid) +#endif #endif /* !(CONFIG_40x || CONFIG_8xx) */ /* * On 8xx, we directly inline tlbie, on others, it's extern */ #ifdef CONFIG_8xx -static inline void _tlbil_va(unsigned long address, unsigned int pid) +static inline void _tlbil_va(unsigned long address, unsigned int pid, + unsigned int tsize, unsigned int ind) { asm volatile ("tlbie %0; sync" : : "r" (address) : "memory"); } -#else /* CONFIG_8xx */ -extern void _tlbil_va(unsigned long address, unsigned int pid); +#elif defined(CONFIG_PPC_BOOK3E) +extern void _tlbil_va(unsigned long address, unsigned int pid, + unsigned int tsize, unsigned int ind); +#else +extern void __tlbil_va(unsigned long address, unsigned int pid); +static inline void _tlbil_va(unsigned long address, unsigned int pid, + unsigned int tsize, unsigned int ind) +{ + __tlbil_va(address, pid); +} #endif /* CONIFG_8xx */ /* @@ -58,10 +74,16 @@ extern void _tlbil_va(unsigned long address, unsigned int pid); * implementation. When that becomes the case, this will be * an extern. */ -static inline void _tlbivax_bcast(unsigned long address, unsigned int pid) +#ifdef CONFIG_PPC_BOOK3E +extern void _tlbivax_bcast(unsigned long address, unsigned int pid, + unsigned int tsize, unsigned int ind); +#else +static inline void _tlbivax_bcast(unsigned long address, unsigned int pid, + unsigned int tsize, unsigned int ind) { BUG(); } +#endif #else /* CONFIG_PPC_MMU_NOHASH */ @@ -99,7 +121,12 @@ extern unsigned int rtas_data, rtas_size; struct hash_pte; extern struct hash_pte *Hash, *Hash_end; extern unsigned long Hash_size, Hash_mask; -#endif + +#endif /* CONFIG_PPC32 */ + +#ifdef CONFIG_PPC64 +extern int map_kernel_page(unsigned long ea, unsigned long pa, int flags); +#endif /* CONFIG_PPC64 */ extern unsigned long ioremap_bot; extern unsigned long __max_low_memory; diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 627767d6169..83f1551ec2c 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -30,6 +30,16 @@ #include <asm/tlbflush.h> #include <asm/tlb.h> +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); + +#ifdef CONFIG_SMP + +/* + * Handle batching of page table freeing on SMP. Page tables are + * queued up and send to be freed later by RCU in order to avoid + * freeing a page table page that is being walked without locks + */ + static DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur); static unsigned long pte_freelist_forced_free; @@ -116,27 +126,7 @@ void pte_free_finish(void) *batchp = NULL; } -/* - * Handle i/d cache flushing, called from set_pte_at() or ptep_set_access_flags() - */ -static pte_t do_dcache_icache_coherency(pte_t pte) -{ - unsigned long pfn = pte_pfn(pte); - struct page *page; - - if (unlikely(!pfn_valid(pfn))) - return pte; - page = pfn_to_page(pfn); - - if (!PageReserved(page) && !test_bit(PG_arch_1, &page->flags)) { - pr_devel("do_dcache_icache_coherency... flushing\n"); - flush_dcache_icache_page(page); - set_bit(PG_arch_1, &page->flags); - } - else - pr_devel("do_dcache_icache_coherency... already clean\n"); - return __pte(pte_val(pte) | _PAGE_HWEXEC); -} +#endif /* CONFIG_SMP */ static inline int is_exec_fault(void) { @@ -145,49 +135,139 @@ static inline int is_exec_fault(void) /* We only try to do i/d cache coherency on stuff that looks like * reasonably "normal" PTEs. We currently require a PTE to be present - * and we avoid _PAGE_SPECIAL and _PAGE_NO_CACHE + * and we avoid _PAGE_SPECIAL and _PAGE_NO_CACHE. We also only do that + * on userspace PTEs */ static inline int pte_looks_normal(pte_t pte) { return (pte_val(pte) & - (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE)) == - (_PAGE_PRESENT); + (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER)) == + (_PAGE_PRESENT | _PAGE_USER); } -#if defined(CONFIG_PPC_STD_MMU) +struct page * maybe_pte_to_page(pte_t pte) +{ + unsigned long pfn = pte_pfn(pte); + struct page *page; + + if (unlikely(!pfn_valid(pfn))) + return NULL; + page = pfn_to_page(pfn); + if (PageReserved(page)) + return NULL; + return page; +} + +#if defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0 + /* Server-style MMU handles coherency when hashing if HW exec permission - * is supposed per page (currently 64-bit only). Else, we always flush - * valid PTEs in set_pte. + * is supposed per page (currently 64-bit only). If not, then, we always + * flush the cache for valid PTEs in set_pte. Embedded CPU without HW exec + * support falls into the same category. */ -static inline int pte_need_exec_flush(pte_t pte, int set_pte) + +static pte_t set_pte_filter(pte_t pte) { - return set_pte && pte_looks_normal(pte) && - !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) || - cpu_has_feature(CPU_FTR_NOEXECUTE)); + pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); + if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) || + cpu_has_feature(CPU_FTR_NOEXECUTE))) { + struct page *pg = maybe_pte_to_page(pte); + if (!pg) + return pte; + if (!test_bit(PG_arch_1, &pg->flags)) { + flush_dcache_icache_page(pg); + set_bit(PG_arch_1, &pg->flags); + } + } + return pte; } -#elif _PAGE_HWEXEC == 0 -/* Embedded type MMU without HW exec support (8xx only so far), we flush - * the cache for any present PTE - */ -static inline int pte_need_exec_flush(pte_t pte, int set_pte) + +static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, + int dirty) { - return set_pte && pte_looks_normal(pte); + return pte; } -#else -/* Other embedded CPUs with HW exec support per-page, we flush on exec - * fault if HWEXEC is not set + +#else /* defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0 */ + +/* Embedded type MMU with HW exec support. This is a bit more complicated + * as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC so + * instead we "filter out" the exec permission for non clean pages. */ -static inline int pte_need_exec_flush(pte_t pte, int set_pte) +static pte_t set_pte_filter(pte_t pte) { - return pte_looks_normal(pte) && is_exec_fault() && - !(pte_val(pte) & _PAGE_HWEXEC); + struct page *pg; + + /* No exec permission in the first place, move on */ + if (!(pte_val(pte) & _PAGE_EXEC) || !pte_looks_normal(pte)) + return pte; + + /* If you set _PAGE_EXEC on weird pages you're on your own */ + pg = maybe_pte_to_page(pte); + if (unlikely(!pg)) + return pte; + + /* If the page clean, we move on */ + if (test_bit(PG_arch_1, &pg->flags)) + return pte; + + /* If it's an exec fault, we flush the cache and make it clean */ + if (is_exec_fault()) { + flush_dcache_icache_page(pg); + set_bit(PG_arch_1, &pg->flags); + return pte; + } + + /* Else, we filter out _PAGE_EXEC */ + return __pte(pte_val(pte) & ~_PAGE_EXEC); } -#endif + +static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, + int dirty) +{ + struct page *pg; + + /* So here, we only care about exec faults, as we use them + * to recover lost _PAGE_EXEC and perform I$/D$ coherency + * if necessary. Also if _PAGE_EXEC is already set, same deal, + * we just bail out + */ + if (dirty || (pte_val(pte) & _PAGE_EXEC) || !is_exec_fault()) + return pte; + +#ifdef CONFIG_DEBUG_VM + /* So this is an exec fault, _PAGE_EXEC is not set. If it was + * an error we would have bailed out earlier in do_page_fault() + * but let's make sure of it + */ + if (WARN_ON(!(vma->vm_flags & VM_EXEC))) + return pte; +#endif /* CONFIG_DEBUG_VM */ + + /* If you set _PAGE_EXEC on weird pages you're on your own */ + pg = maybe_pte_to_page(pte); + if (unlikely(!pg)) + goto bail; + + /* If the page is already clean, we move on */ + if (test_bit(PG_arch_1, &pg->flags)) + goto bail; + + /* Clean the page and set PG_arch_1 */ + flush_dcache_icache_page(pg); + set_bit(PG_arch_1, &pg->flags); + + bail: + return __pte(pte_val(pte) | _PAGE_EXEC); +} + +#endif /* !(defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0) */ /* * set_pte stores a linux PTE into the linux page table. */ -void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) +void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t pte) { #ifdef CONFIG_DEBUG_VM WARN_ON(pte_present(*ptep)); @@ -196,9 +276,7 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte * this context might not have been activated yet when this * is called. */ - pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); - if (pte_need_exec_flush(pte, 1)) - pte = do_dcache_icache_coherency(pte); + pte = set_pte_filter(pte); /* Perform the setting of the PTE */ __set_pte_at(mm, addr, ptep, pte, 0); @@ -215,8 +293,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty) { int changed; - if (!dirty && pte_need_exec_flush(entry, 0)) - entry = do_dcache_icache_coherency(entry); + entry = set_access_flags_filter(entry, vma, dirty); changed = !pte_same(*(ptep), entry); if (changed) { if (!(vma->vm_flags & VM_HUGETLB)) @@ -242,7 +319,7 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr) BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, addr); BUG_ON(!pmd_present(*pmd)); - BUG_ON(!spin_is_locked(pte_lockptr(mm, pmd))); + assert_spin_locked(pte_lockptr(mm, pmd)); } #endif /* CONFIG_DEBUG_VM */ diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 5422169626b..cb96cb2e17c 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -142,7 +142,7 @@ ioremap_flags(phys_addr_t addr, unsigned long size, unsigned long flags) flags |= _PAGE_DIRTY | _PAGE_HWWRITE; /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */ - flags &= ~(_PAGE_USER | _PAGE_EXEC | _PAGE_HWEXEC); + flags &= ~(_PAGE_USER | _PAGE_EXEC); return __ioremap_caller(addr, size, flags, __builtin_return_address(0)); } diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index bfa7db6b2fd..853d5565eed 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -33,6 +33,8 @@ #include <linux/stddef.h> #include <linux/vmalloc.h> #include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/lmb.h> #include <asm/pgalloc.h> #include <asm/page.h> @@ -55,19 +57,36 @@ unsigned long ioremap_bot = IOREMAP_BASE; + +#ifdef CONFIG_PPC_MMU_NOHASH +static void *early_alloc_pgtable(unsigned long size) +{ + void *pt; + + if (init_bootmem_done) + pt = __alloc_bootmem(size, size, __pa(MAX_DMA_ADDRESS)); + else + pt = __va(lmb_alloc_base(size, size, + __pa(MAX_DMA_ADDRESS))); + memset(pt, 0, size); + + return pt; +} +#endif /* CONFIG_PPC_MMU_NOHASH */ + /* - * map_io_page currently only called by __ioremap - * map_io_page adds an entry to the ioremap page table + * map_kernel_page currently only called by __ioremap + * map_kernel_page adds an entry to the ioremap page table * and adds an entry to the HPT, possibly bolting it */ -static int map_io_page(unsigned long ea, unsigned long pa, int flags) +int map_kernel_page(unsigned long ea, unsigned long pa, int flags) { pgd_t *pgdp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep; - if (mem_init_done) { + if (slab_is_available()) { pgdp = pgd_offset_k(ea); pudp = pud_alloc(&init_mm, pgdp, ea); if (!pudp) @@ -81,6 +100,35 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags) set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, __pgprot(flags))); } else { +#ifdef CONFIG_PPC_MMU_NOHASH + /* Warning ! This will blow up if bootmem is not initialized + * which our ppc64 code is keen to do that, we'll need to + * fix it and/or be more careful + */ + pgdp = pgd_offset_k(ea); +#ifdef PUD_TABLE_SIZE + if (pgd_none(*pgdp)) { + pudp = early_alloc_pgtable(PUD_TABLE_SIZE); + BUG_ON(pudp == NULL); + pgd_populate(&init_mm, pgdp, pudp); + } +#endif /* PUD_TABLE_SIZE */ + pudp = pud_offset(pgdp, ea); + if (pud_none(*pudp)) { + pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); + BUG_ON(pmdp == NULL); + pud_populate(&init_mm, pudp, pmdp); + } + pmdp = pmd_offset(pudp, ea); + if (!pmd_present(*pmdp)) { + ptep = early_alloc_pgtable(PAGE_SIZE); + BUG_ON(ptep == NULL); + pmd_populate_kernel(&init_mm, pmdp, ptep); + } + ptep = pte_offset_kernel(pmdp, ea); + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, + __pgprot(flags))); +#else /* CONFIG_PPC_MMU_NOHASH */ /* * If the mm subsystem is not fully up, we cannot create a * linux page table entry for this mapping. Simply bolt an @@ -93,6 +141,7 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags) "memory at %016lx !\n", pa); return -ENOMEM; } +#endif /* !CONFIG_PPC_MMU_NOHASH */ } return 0; } @@ -124,7 +173,7 @@ void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size, WARN_ON(size & ~PAGE_MASK); for (i = 0; i < size; i += PAGE_SIZE) - if (map_io_page((unsigned long)ea+i, pa+i, flags)) + if (map_kernel_page((unsigned long)ea+i, pa+i, flags)) return NULL; return (void __iomem *)ea; diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index a685652effe..1d98ecc8eec 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -191,7 +191,7 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) unsigned long slbie_data = 0; unsigned long pc = KSTK_EIP(tsk); unsigned long stack = KSTK_ESP(tsk); - unsigned long unmapped_base; + unsigned long exec_base; /* * We need interrupts hard-disabled here, not just soft-disabled, @@ -227,42 +227,44 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) /* * preload some userspace segments into the SLB. + * Almost all 32 and 64bit PowerPC executables are linked at + * 0x10000000 so it makes sense to preload this segment. */ - if (test_tsk_thread_flag(tsk, TIF_32BIT)) - unmapped_base = TASK_UNMAPPED_BASE_USER32; - else - unmapped_base = TASK_UNMAPPED_BASE_USER64; + exec_base = 0x10000000; - if (is_kernel_addr(pc)) - return; - slb_allocate(pc); - - if (esids_match(pc,stack)) + if (is_kern |