aboutsummaryrefslogtreecommitdiff
path: root/arch/tile/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/tile/mm')
-rw-r--r--arch/tile/mm/elf.c114
-rw-r--r--arch/tile/mm/fault.c256
-rw-r--r--arch/tile/mm/highmem.c8
-rw-r--r--arch/tile/mm/homecache.c185
-rw-r--r--arch/tile/mm/hugetlbpage.c397
-rw-r--r--arch/tile/mm/init.c293
-rw-r--r--arch/tile/mm/migrate.h6
-rw-r--r--arch/tile/mm/migrate_32.S41
-rw-r--r--arch/tile/mm/migrate_64.S167
-rw-r--r--arch/tile/mm/mmap.c26
-rw-r--r--arch/tile/mm/pgtable.c295
11 files changed, 994 insertions, 794 deletions
diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c
index 55e58e93bfc..23f044e8a7a 100644
--- a/arch/tile/mm/elf.c
+++ b/arch/tile/mm/elf.c
@@ -21,6 +21,8 @@
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/sections.h>
+#include <asm/vdso.h>
+#include <arch/sim.h>
/* Notify a running simulator, if any, that an exec just occurred. */
static void sim_notify_exec(const char *binary_name)
@@ -35,28 +37,57 @@ static void sim_notify_exec(const char *binary_name)
} while (c);
}
-static int notify_exec(void)
+static int notify_exec(struct mm_struct *mm)
{
- int retval = 0; /* failure */
- struct vm_area_struct *vma = current->mm->mmap;
- while (vma) {
- if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file)
+ char *buf, *path;
+ struct vm_area_struct *vma;
+
+ if (!sim_is_simulator())
+ return 1;
+
+ if (mm->exe_file == NULL)
+ return 0;
+
+ for (vma = current->mm->mmap; ; vma = vma->vm_next) {
+ if (vma == NULL)
+ return 0;
+ if (vma->vm_file == mm->exe_file)
break;
- vma = vma->vm_next;
}
- if (vma) {
- char *buf = (char *) __get_free_page(GFP_KERNEL);
- if (buf) {
- char *path = d_path(&vma->vm_file->f_path,
- buf, PAGE_SIZE);
- if (!IS_ERR(path)) {
- sim_notify_exec(path);
- retval = 1;
- }
- free_page((unsigned long)buf);
+
+ buf = (char *) __get_free_page(GFP_KERNEL);
+ if (buf == NULL)
+ return 0;
+
+ path = d_path(&mm->exe_file->f_path, buf, PAGE_SIZE);
+ if (IS_ERR(path)) {
+ free_page((unsigned long)buf);
+ return 0;
+ }
+
+ /*
+ * Notify simulator of an ET_DYN object so we know the load address.
+ * The somewhat cryptic overuse of SIM_CONTROL_DLOPEN allows us
+ * to be backward-compatible with older simulator releases.
+ */
+ if (vma->vm_start == (ELF_ET_DYN_BASE & PAGE_MASK)) {
+ char buf[64];
+ int i;
+
+ snprintf(buf, sizeof(buf), "0x%lx:@", vma->vm_start);
+ for (i = 0; ; ++i) {
+ char c = buf[i];
+ __insn_mtspr(SPR_SIM_CONTROL,
+ (SIM_CONTROL_DLOPEN
+ | (c << _SIM_CONTROL_OPERATOR_BITS)));
+ if (c == '\0')
+ break;
}
}
- return retval;
+
+ sim_notify_exec(path);
+ free_page((unsigned long)buf);
+ return 1;
}
/* Notify a running simulator, if any, that we loaded an interpreter. */
@@ -72,63 +103,23 @@ static void sim_notify_interp(unsigned long load_addr)
}
-/* Kernel address of page used to map read-only kernel data into userspace. */
-static void *vdso_page;
-
-/* One-entry array used for install_special_mapping. */
-static struct page *vdso_pages[1];
-
-static int __init vdso_setup(void)
-{
- vdso_page = (void *)get_zeroed_page(GFP_ATOMIC);
- memcpy(vdso_page, __rt_sigreturn, __rt_sigreturn_end - __rt_sigreturn);
- vdso_pages[0] = virt_to_page(vdso_page);
- return 0;
-}
-device_initcall(vdso_setup);
-
-const char *arch_vma_name(struct vm_area_struct *vma)
-{
- if (vma->vm_private_data == vdso_pages)
- return "[vdso]";
-#ifndef __tilegx__
- if (vma->vm_start == MEM_USER_INTRPT)
- return "[intrpt]";
-#endif
- return NULL;
-}
-
int arch_setup_additional_pages(struct linux_binprm *bprm,
int executable_stack)
{
struct mm_struct *mm = current->mm;
- unsigned long vdso_base;
int retval = 0;
+ down_write(&mm->mmap_sem);
+
/*
* Notify the simulator that an exec just occurred.
* If we can't find the filename of the mapping, just use
* whatever was passed as the linux_binprm filename.
*/
- if (!notify_exec())
+ if (!notify_exec(mm))
sim_notify_exec(bprm->filename);
- down_write(&mm->mmap_sem);
-
- /*
- * MAYWRITE to allow gdb to COW and set breakpoints
- *
- * Make sure the vDSO gets into every core dump. Dumping its
- * contents makes post-mortem fully interpretable later
- * without matching up the same kernel and hardware config to
- * see what PC values meant.
- */
- vdso_base = VDSO_BASE;
- retval = install_special_mapping(mm, vdso_base, PAGE_SIZE,
- VM_READ|VM_EXEC|
- VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
- VM_ALWAYSDUMP,
- vdso_pages);
+ retval = setup_vdso_pages();
#ifndef __tilegx__
/*
@@ -140,7 +131,6 @@ int arch_setup_additional_pages(struct linux_binprm *bprm,
if (!retval) {
unsigned long addr = MEM_USER_INTRPT;
addr = mmap_region(NULL, addr, INTRPT_SIZE,
- MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE,
VM_READ|VM_EXEC|
VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0);
if (addr > (unsigned long) -PAGE_SIZE)
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
index f295b4ac941..6c0571216a9 100644
--- a/arch/tile/mm/fault.c
+++ b/arch/tile/mm/fault.c
@@ -24,7 +24,6 @@
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
-#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
@@ -35,8 +34,8 @@
#include <linux/hugetlb.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>
+#include <linux/kdebug.h>
-#include <asm/system.h>
#include <asm/pgalloc.h>
#include <asm/sections.h>
#include <asm/traps.h>
@@ -44,15 +43,18 @@
#include <arch/interrupts.h>
-static noinline void force_sig_info_fault(int si_signo, int si_code,
- unsigned long address, int fault_num, struct task_struct *tsk)
+static noinline void force_sig_info_fault(const char *type, int si_signo,
+ int si_code, unsigned long address,
+ int fault_num,
+ struct task_struct *tsk,
+ struct pt_regs *regs)
{
siginfo_t info;
if (unlikely(tsk->pid < 2)) {
panic("Signal %d (code %d) at %#lx sent to %s!",
si_signo, si_code & 0xffff, address,
- tsk->pid ? "init" : "the idle task");
+ is_idle_task(tsk) ? "the idle task" : "init");
}
info.si_signo = si_signo;
@@ -60,6 +62,7 @@ static noinline void force_sig_info_fault(int si_signo, int si_code,
info.si_code = si_code;
info.si_addr = (void __user *)address;
info.si_trapno = fault_num;
+ trace_unhandled_signal(type, regs, address, si_signo);
force_sig_info(si_signo, &info, tsk);
}
@@ -68,15 +71,17 @@ static noinline void force_sig_info_fault(int si_signo, int si_code,
* Synthesize the fault a PL0 process would get by doing a word-load of
* an unaligned address or a high kernel address.
*/
-SYSCALL_DEFINE2(cmpxchg_badaddr, unsigned long, address,
- struct pt_regs *, regs)
+SYSCALL_DEFINE1(cmpxchg_badaddr, unsigned long, address)
{
+ struct pt_regs *regs = current_pt_regs();
+
if (address >= PAGE_OFFSET)
- force_sig_info_fault(SIGSEGV, SEGV_MAPERR, address,
- INT_DTLB_MISS, current);
+ force_sig_info_fault("atomic segfault", SIGSEGV, SEGV_MAPERR,
+ address, INT_DTLB_MISS, current, regs);
else
- force_sig_info_fault(SIGBUS, BUS_ADRALN, address,
- INT_UNALIGN_DATA, current);
+ force_sig_info_fault("atomic alignment fault", SIGBUS,
+ BUS_ADRALN, address,
+ INT_UNALIGN_DATA, current, regs);
/*
* Adjust pc to point at the actual instruction, which is unusual
@@ -118,16 +123,15 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
pmd_k = pmd_offset(pud_k, address);
if (!pmd_present(*pmd_k))
return NULL;
- if (!pmd_present(*pmd)) {
+ if (!pmd_present(*pmd))
set_pmd(pmd, *pmd_k);
- arch_flush_lazy_mmu_mode();
- } else
+ else
BUG_ON(pmd_ptfn(*pmd) != pmd_ptfn(*pmd_k));
return pmd_k;
}
/*
- * Handle a fault on the vmalloc or module mapping area
+ * Handle a fault on the vmalloc area.
*/
static inline int vmalloc_fault(pgd_t *pgd, unsigned long address)
{
@@ -145,8 +149,6 @@ static inline int vmalloc_fault(pgd_t *pgd, unsigned long address)
pmd_k = vmalloc_sync_one(pgd, address);
if (!pmd_k)
return -1;
- if (pmd_huge(*pmd_k))
- return 0; /* support TILE huge_vmap() API */
pte_k = pte_offset_kernel(pmd_k, address);
if (!pte_present(*pte_k))
return -1;
@@ -184,7 +186,7 @@ static pgd_t *get_current_pgd(void)
HV_Context ctx = hv_inquire_context();
unsigned long pgd_pfn = ctx.page_table >> PAGE_SHIFT;
struct page *pgd_page = pfn_to_page(pgd_pfn);
- BUG_ON(PageHighMem(pgd_page)); /* oops, HIGHPTE? */
+ BUG_ON(PageHighMem(pgd_page));
return (pgd_t *) __va(ctx.page_table);
}
@@ -200,9 +202,14 @@ static pgd_t *get_current_pgd(void)
* interrupt or a critical region, and must do as little as possible.
* Similarly, we can't use atomic ops here, since we may be handling a
* fault caused by an atomic op access.
+ *
+ * If we find a migrating PTE while we're in an NMI context, and we're
+ * at a PC that has a registered exception handler, we don't wait,
+ * since this thread may (e.g.) have been interrupted while migrating
+ * its own stack, which would then cause us to self-deadlock.
*/
static int handle_migrating_pte(pgd_t *pgd, int fault_num,
- unsigned long address,
+ unsigned long address, unsigned long pc,
int is_kernel_mode, int write)
{
pud_t *pud;
@@ -224,6 +231,8 @@ static int handle_migrating_pte(pgd_t *pgd, int fault_num,
pte_offset_kernel(pmd, address);
pteval = *pte;
if (pte_migrating(pteval)) {
+ if (in_nmi() && search_exception_tables(pc))
+ return 0;
wait_for_migration(pte);
return 1;
}
@@ -263,12 +272,15 @@ static int handle_page_fault(struct pt_regs *regs,
int si_code;
int is_kernel_mode;
pgd_t *pgd;
+ unsigned int flags;
/* on TILE, protection faults are always writes */
if (!is_page_fault)
write = 1;
- is_kernel_mode = (EX1_PL(regs->ex1) != USER_PL);
+ flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+
+ is_kernel_mode = !user_mode(regs);
tsk = validate_current();
@@ -291,13 +303,13 @@ static int handle_page_fault(struct pt_regs *regs,
/*
* Early on, we need to check for migrating PTE entries;
* see homecache.c. If we find a migrating PTE, we wait until
- * the backing page claims to be done migrating, then we procede.
+ * the backing page claims to be done migrating, then we proceed.
* For kernel PTEs, we rewrite the PTE and return and retry.
* Otherwise, we treat the fault like a normal "no PTE" fault,
* rather than trying to patch up the existing PTE.
*/
pgd = get_current_pgd();
- if (handle_migrating_pte(pgd, fault_num, address,
+ if (handle_migrating_pte(pgd, fault_num, address, regs->pc,
is_kernel_mode, write))
return 1;
@@ -332,9 +344,12 @@ static int handle_page_fault(struct pt_regs *regs,
/*
* If we're trying to touch user-space addresses, we must
* be either at PL0, or else with interrupts enabled in the
- * kernel, so either way we can re-enable interrupts here.
+ * kernel, so either way we can re-enable interrupts here
+ * unless we are doing atomic access to user space with
+ * interrupts disabled.
*/
- local_irq_enable();
+ if (!(regs->flags & PT_FLAGS_DISABLE_IRQ))
+ local_irq_enable();
mm = tsk->mm;
@@ -347,6 +362,9 @@ static int handle_page_fault(struct pt_regs *regs,
goto bad_area_nosemaphore;
}
+ if (!is_kernel_mode)
+ flags |= FAULT_FLAG_USER;
+
/*
* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in the
@@ -369,6 +387,8 @@ static int handle_page_fault(struct pt_regs *regs,
vma = NULL; /* happy compiler */
goto bad_area_nosemaphore;
}
+
+retry:
down_read(&mm->mmap_sem);
}
@@ -405,18 +425,22 @@ good_area:
#endif
if (!(vma->vm_flags & VM_WRITE))
goto bad_area;
+ flags |= FAULT_FLAG_WRITE;
} else {
if (!is_page_fault || !(vma->vm_flags & VM_READ))
goto bad_area;
}
- survive:
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
- fault = handle_mm_fault(mm, vma, address, write);
+ fault = handle_mm_fault(mm, vma, address, flags);
+
+ if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
+ return 0;
+
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
@@ -424,33 +448,33 @@ good_area:
goto do_sigbus;
BUG();
}
- if (fault & VM_FAULT_MAJOR)
- tsk->maj_flt++;
- else
- tsk->min_flt++;
+ if (flags & FAULT_FLAG_ALLOW_RETRY) {
+ if (fault & VM_FAULT_MAJOR)
+ tsk->maj_flt++;
+ else
+ tsk->min_flt++;
+ if (fault & VM_FAULT_RETRY) {
+ flags &= ~FAULT_FLAG_ALLOW_RETRY;
+ flags |= FAULT_FLAG_TRIED;
+
+ /*
+ * No need to up_read(&mm->mmap_sem) as we would
+ * have already released it in __lock_page_or_retry
+ * in mm/filemap.c.
+ */
+ goto retry;
+ }
+ }
-#if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC()
- /*
- * If this was an asynchronous fault,
- * restart the appropriate engine.
- */
- switch (fault_num) {
#if CHIP_HAS_TILE_DMA()
+ /* If this was a DMA TLB fault, restart the DMA engine. */
+ switch (fault_num) {
case INT_DMATLB_MISS:
case INT_DMATLB_MISS_DWNCL:
case INT_DMATLB_ACCESS:
case INT_DMATLB_ACCESS_DWNCL:
__insn_mtspr(SPR_DMA_CTR, SPR_DMA_CTR__REQUEST_MASK);
break;
-#endif
-#if CHIP_HAS_SN_PROC()
- case INT_SNITLB_MISS:
- case INT_SNITLB_MISS_DWNCL:
- __insn_mtspr(SPR_SNCTL,
- __insn_mfspr(SPR_SNCTL) &
- ~SPR_SNCTL__FRZPROC_MASK);
- break;
-#endif
}
#endif
@@ -472,8 +496,8 @@ bad_area_nosemaphore:
*/
local_irq_enable();
- force_sig_info_fault(SIGSEGV, si_code, address,
- fault_num, tsk);
+ force_sig_info_fault("segfault", SIGSEGV, si_code, address,
+ fault_num, tsk, regs);
return 0;
}
@@ -511,7 +535,7 @@ no_context:
if (unlikely(tsk->pid < 2)) {
panic("Kernel page fault running %s!",
- tsk->pid ? "init" : "the idle task");
+ is_idle_task(tsk) ? "the idle task" : "init");
}
/*
@@ -531,15 +555,10 @@ no_context:
*/
out_of_memory:
up_read(&mm->mmap_sem);
- if (is_global_init(tsk)) {
- yield();
- down_read(&mm->mmap_sem);
- goto survive;
- }
- pr_alert("VM: killing process %s\n", tsk->comm);
- if (!is_kernel_mode)
- do_group_exit(SIGKILL);
- goto no_context;
+ if (is_kernel_mode)
+ goto no_context;
+ pagefault_out_of_memory();
+ return 0;
do_sigbus:
up_read(&mm->mmap_sem);
@@ -548,7 +567,8 @@ do_sigbus:
if (is_kernel_mode)
goto no_context;
- force_sig_info_fault(SIGBUS, BUS_ADRERR, address, fault_num, tsk);
+ force_sig_info_fault("bus error", SIGBUS, BUS_ADRERR, address,
+ fault_num, tsk, regs);
return 0;
}
@@ -656,20 +676,12 @@ struct intvec_state do_page_fault_ics(struct pt_regs *regs, int fault_num,
}
/*
- * NOTE: the one other type of access that might bring us here
- * are the memory ops in __tns_atomic_acquire/__tns_atomic_release,
- * but we don't have to check specially for them since we can
- * always safely return to the address of the fault and retry,
- * since no separate atomic locks are involved.
- */
-
- /*
* Now that we have released the atomic lock (if necessary),
* it's safe to spin if the PTE that caused the fault was migrating.
*/
if (fault_num == INT_DTLB_ACCESS)
write = 1;
- if (handle_migrating_pte(pgd, fault_num, address, 1, write))
+ if (handle_migrating_pte(pgd, fault_num, address, pc, 1, write))
return state;
/* Return zero so that we continue on with normal fault handling. */
@@ -692,8 +704,60 @@ void do_page_fault(struct pt_regs *regs, int fault_num,
{
int is_page_fault;
+#ifdef CONFIG_KPROBES
+ /*
+ * This is to notify the fault handler of the kprobes. The
+ * exception code is redundant as it is also carried in REGS,
+ * but we pass it anyhow.
+ */
+ if (notify_die(DIE_PAGE_FAULT, "page fault", regs, -1,
+ regs->faultnum, SIGSEGV) == NOTIFY_STOP)
+ return;
+#endif
+
+#ifdef __tilegx__
+ /*
+ * We don't need early do_page_fault_ics() support, since unlike
+ * Pro we don't need to worry about unlocking the atomic locks.
+ * There is only one current case in GX where we touch any memory
+ * under ICS other than our own kernel stack, and we handle that
+ * here. (If we crash due to trying to touch our own stack,
+ * we're in too much trouble for C code to help out anyway.)
+ */
+ if (write & ~1) {
+ unsigned long pc = write & ~1;
+ if (pc >= (unsigned long) __start_unalign_asm_code &&
+ pc < (unsigned long) __end_unalign_asm_code) {
+ struct thread_info *ti = current_thread_info();
+ /*
+ * Our EX_CONTEXT is still what it was from the
+ * initial unalign exception, but now we've faulted
+ * on the JIT page. We would like to complete the
+ * page fault however is appropriate, and then retry
+ * the instruction that caused the unalign exception.
+ * Our state has been "corrupted" by setting the low
+ * bit in "sp", and stashing r0..r3 in the
+ * thread_info area, so we revert all of that, then
+ * continue as if this were a normal page fault.
+ */
+ regs->sp &= ~1UL;
+ regs->regs[0] = ti->unalign_jit_tmp[0];
+ regs->regs[1] = ti->unalign_jit_tmp[1];
+ regs->regs[2] = ti->unalign_jit_tmp[2];
+ regs->regs[3] = ti->unalign_jit_tmp[3];
+ write &= 1;
+ } else {
+ pr_alert("%s/%d: ICS set at page fault at %#lx: %#lx\n",
+ current->comm, current->pid, pc, address);
+ show_regs(regs);
+ do_group_exit(SIGKILL);
+ return;
+ }
+ }
+#else
/* This case should have been handled by do_page_fault_ics(). */
BUG_ON(write & ~1);
+#endif
#if CHIP_HAS_TILE_DMA()
/*
@@ -722,10 +786,6 @@ void do_page_fault(struct pt_regs *regs, int fault_num,
case INT_DMATLB_MISS:
case INT_DMATLB_MISS_DWNCL:
#endif
-#if CHIP_HAS_SN_PROC()
- case INT_SNITLB_MISS:
- case INT_SNITLB_MISS_DWNCL:
-#endif
is_page_fault = 1;
break;
@@ -741,7 +801,8 @@ void do_page_fault(struct pt_regs *regs, int fault_num,
panic("Bad fault number %d in do_page_fault", fault_num);
}
- if (EX1_PL(regs->ex1) != USER_PL) {
+#if CHIP_HAS_TILE_DMA()
+ if (!user_mode(regs)) {
struct async_tlb *async;
switch (fault_num) {
#if CHIP_HAS_TILE_DMA()
@@ -752,12 +813,6 @@ void do_page_fault(struct pt_regs *regs, int fault_num,
async = &current->thread.dma_async_tlb;
break;
#endif
-#if CHIP_HAS_SN_PROC()
- case INT_SNITLB_MISS:
- case INT_SNITLB_MISS_DWNCL:
- async = &current->thread.sn_async_tlb;
- break;
-#endif
default:
async = NULL;
}
@@ -784,19 +839,28 @@ void do_page_fault(struct pt_regs *regs, int fault_num,
return;
}
}
+#endif
handle_page_fault(regs, fault_num, is_page_fault, address, write);
}
-#if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC()
+#if CHIP_HAS_TILE_DMA()
/*
- * Check an async_tlb structure to see if a deferred fault is waiting,
- * and if so pass it to the page-fault code.
+ * This routine effectively re-issues asynchronous page faults
+ * when we are returning to user space.
*/
-static void handle_async_page_fault(struct pt_regs *regs,
- struct async_tlb *async)
+void do_async_page_fault(struct pt_regs *regs)
{
+ struct async_tlb *async = &current->thread.dma_async_tlb;
+
+ /*
+ * Clear thread flag early. If we re-interrupt while processing
+ * code here, we will reset it and recall this routine before
+ * returning to user space.
+ */
+ clear_thread_flag(TIF_ASYNC_TLB);
+
if (async->fault_num) {
/*
* Clear async->fault_num before calling the page-fault
@@ -810,35 +874,15 @@ static void handle_async_page_fault(struct pt_regs *regs,
async->address, async->is_write);
}
}
-#endif /* CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC() */
-
-
-/*
- * This routine effectively re-issues asynchronous page faults
- * when we are returning to user space.
- */
-void do_async_page_fault(struct pt_regs *regs)
-{
- /*
- * Clear thread flag early. If we re-interrupt while processing
- * code here, we will reset it and recall this routine before
- * returning to user space.
- */
- clear_thread_flag(TIF_ASYNC_TLB);
+#endif /* CHIP_HAS_TILE_DMA() */
-#if CHIP_HAS_TILE_DMA()
- handle_async_page_fault(regs, &current->thread.dma_async_tlb);
-#endif
-#if CHIP_HAS_SN_PROC()
- handle_async_page_fault(regs, &current->thread.sn_async_tlb);
-#endif
-}
void vmalloc_sync_all(void)
{
#ifdef __tilegx__
/* Currently all L1 kernel pmd's are static and shared. */
- BUG_ON(pgd_index(VMALLOC_END) != pgd_index(VMALLOC_START));
+ BUILD_BUG_ON(pgd_index(VMALLOC_END - PAGE_SIZE) !=
+ pgd_index(VMALLOC_START));
#else
/*
* Note that races in the updates of insync and start aren't
diff --git a/arch/tile/mm/highmem.c b/arch/tile/mm/highmem.c
index 31dbbd9afe4..0dc21829477 100644
--- a/arch/tile/mm/highmem.c
+++ b/arch/tile/mm/highmem.c
@@ -93,7 +93,7 @@ static DEFINE_PER_CPU(struct kmap_amps, amps);
* If we examine it earlier we are exposed to a race where it looks
* writable earlier, but becomes immutable before we write the PTE.
*/
-static void kmap_atomic_register(struct page *page, enum km_type type,
+static void kmap_atomic_register(struct page *page, int type,
unsigned long va, pte_t *ptep, pte_t pteval)
{
unsigned long flags;
@@ -114,7 +114,6 @@ static void kmap_atomic_register(struct page *page, enum km_type type,
list_add(&amp->list, &amp_list);
set_pte(ptep, pteval);
- arch_flush_lazy_mmu_mode();
spin_unlock(&amp_lock);
homecache_kpte_unlock(flags);
@@ -224,12 +223,12 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
}
EXPORT_SYMBOL(kmap_atomic_prot);
-void *__kmap_atomic(struct page *page)
+void *kmap_atomic(struct page *page)
{
/* PAGE_NONE is a magic value that tells us to check immutability. */
return kmap_atomic_prot(page, PAGE_NONE);
}
-EXPORT_SYMBOL(__kmap_atomic);
+EXPORT_SYMBOL(kmap_atomic);
void __kunmap_atomic(void *kvaddr)
{
@@ -259,7 +258,6 @@ void __kunmap_atomic(void *kvaddr)
BUG_ON(vaddr >= (unsigned long)high_memory);
}
- arch_flush_lazy_mmu_mode();
pagefault_enable();
}
EXPORT_SYMBOL(__kunmap_atomic);
diff --git a/arch/tile/mm/homecache.c b/arch/tile/mm/homecache.c
index d78df3a6ee1..33294fdc402 100644
--- a/arch/tile/mm/homecache.c
+++ b/arch/tile/mm/homecache.c
@@ -30,6 +30,7 @@
#include <linux/cache.h>
#include <linux/smp.h>
#include <linux/module.h>
+#include <linux/hugetlb.h>
#include <asm/page.h>
#include <asm/sections.h>
@@ -42,12 +43,9 @@
#include "migrate.h"
-#if CHIP_HAS_COHERENT_LOCAL_CACHE()
-
/*
* The noallocl2 option suppresses all use of the L2 cache to cache
- * locally from a remote home. There's no point in using it if we
- * don't have coherent local caching, though.
+ * locally from a remote home.
*/
static int __write_once noallocl2;
static int __init set_noallocl2(char *str)
@@ -57,16 +55,6 @@ static int __init set_noallocl2(char *str)
}
early_param("noallocl2", set_noallocl2);
-#else
-
-#define noallocl2 0
-
-#endif
-
-/* Provide no-op versions of these routines to keep flush_remote() cleaner. */
-#define mark_caches_evicted_start() 0
-#define mark_caches_evicted_finish(mask, timestamp) do {} while (0)
-
/*
* Update the irq_stat for cpus that we are going to interrupt
@@ -106,7 +94,6 @@ static void hv_flush_update(const struct cpumask *cache_cpumask,
* there's never any good reason for hv_flush_remote() to fail.
* - Accepts a 32-bit PFN rather than a 64-bit PA, which generally
* is the type that Linux wants to pass around anyway.
- * - Centralizes the mark_caches_evicted() handling.
* - Canonicalizes that lengths of zero make cpumasks NULL.
* - Handles deferring TLB flushes for dataplane tiles.
* - Tracks remote interrupts in the per-cpu irq_cpustat_t.
@@ -125,7 +112,6 @@ void flush_remote(unsigned long cache_pfn, unsigned long cache_control,
HV_Remote_ASID *asids, int asidcount)
{
int rc;
- int timestamp = 0; /* happy compiler */
struct cpumask cache_cpumask_copy, tlb_cpumask_copy;
struct cpumask *cache_cpumask, *tlb_cpumask;
HV_PhysAddr cache_pa;
@@ -156,15 +142,11 @@ void flush_remote(unsigned long cache_pfn, unsigned long cache_control,
hv_flush_update(cache_cpumask, tlb_cpumask, tlb_va, tlb_length,
asids, asidcount);
cache_pa = (HV_PhysAddr)cache_pfn << PAGE_SHIFT;
- if (cache_control & HV_FLUSH_EVICT_L2)
- timestamp = mark_caches_evicted_start();
rc = hv_flush_remote(cache_pa, cache_control,
cpumask_bits(cache_cpumask),
tlb_va, tlb_length, tlb_pgsize,
cpumask_bits(tlb_cpumask),
asids, asidcount);
- if (cache_control & HV_FLUSH_EVICT_L2)
- mark_caches_evicted_finish(cache_cpumask, timestamp);
if (rc == 0)
return;
cpumask_scnprintf(cache_buf, sizeof(cache_buf), &cache_cpumask_copy);
@@ -179,59 +161,88 @@ void flush_remote(unsigned long cache_pfn, unsigned long cache_control,
panic("Unsafe to continue.");
}
-void homecache_evict(const struct cpumask *mask)
+static void homecache_finv_page_va(void* va, int home)
{
- flush_remote(0, HV_FLUSH_EVICT_L2, mask, 0, 0, 0, NULL, NULL, 0);
+ int cpu = get_cpu();
+ if (home == cpu) {
+ finv_buffer_local(va, PAGE_SIZE);
+ } else if (home == PAGE_HOME_HASH) {
+ finv_buffer_remote(va, PAGE_SIZE, 1);
+ } else {
+ BUG_ON(home < 0 || home >= NR_CPUS);
+ finv_buffer_remote(va, PAGE_SIZE, 0);
+ }
+ put_cpu();
}
-/* Return a mask of the cpus whose caches currently own these pages. */
-static void homecache_mask(struct page *page, int pages,
- struct cpumask *home_mask)
+void homecache_finv_map_page(struct page *page, int home)
{
- int i;
- cpumask_clear(home_mask);
- for (i = 0; i < pages; ++i) {
- int home = page_home(&page[i]);
- if (home == PAGE_HOME_IMMUTABLE ||
- home == PAGE_HOME_INCOHERENT) {
- cpumask_copy(home_mask, cpu_possible_mask);
- return;
- }
-#if CHIP_HAS_CBOX_HOME_MAP()
- if (home == PAGE_HOME_HASH) {
- cpumask_or(home_mask, home_mask, &hash_for_home_map);
- continue;
- }
+ unsigned long flags;
+ unsigned long va;
+ pte_t *ptep;
+ pte_t pte;
+
+ if (home == PAGE_HOME_UNCACHED)
+ return;
+ local_irq_save(flags);
+#ifdef CONFIG_HIGHMEM
+ va = __fix_to_virt(FIX_KMAP_BEGIN + kmap_atomic_idx_push() +
+ (KM_TYPE_NR * smp_processor_id()));
+#else
+ va = __fix_to_virt(FIX_HOMECACHE_BEGIN + smp_processor_id());
#endif
- if (home == PAGE_HOME_UNCACHED)
- continue;
- BUG_ON(home < 0 || home >= NR_CPUS);
- cpumask_set_cpu(home, home_mask);
- }
+ ptep = virt_to_kpte(va);
+ pte = pfn_pte(page_to_pfn(page), PAGE_KERNEL);
+ __set_pte(ptep, pte_set_home(pte, home));
+ homecache_finv_page_va((void *)va, home);
+ __pte_clear(ptep);
+ hv_flush_page(va, PAGE_SIZE);
+#ifdef CONFIG_HIGHMEM
+ kmap_atomic_idx_pop();
+#endif
+ local_irq_restore(flags);
}
-/*
- * Return the passed length, or zero if it's long enough that we
- * believe we should evict the whole L2 cache.
- */
-static unsigned long cache_flush_length(unsigned long length)
+static void homecache_finv_page_home(struct page *page, int home)
+{
+ if (!PageHighMem(page) && home == page_home(page))
+ homecache_finv_page_va(page_address(page), home);
+ else
+ homecache_finv_map_page(page, home);
+}
+
+static inline bool incoherent_home(int home)
+{
+ return home == PAGE_HOME_IMMUTABLE || home == PAGE_HOME_INCOHERENT;
+}
+
+static void homecache_finv_page_internal(struct page *page, int force_map)
{
- return (length >= CHIP_L2_CACHE_SIZE()) ? HV_FLUSH_EVICT_L2 : length;
+ int home = page_home(page);
+ if (home == PAGE_HOME_UNCACHED)
+ return;
+ if (incoherent_home(home)) {
+ int cpu;
+ for_each_cpu(cpu, &cpu_cacheable_map)
+ homecache_finv_map_page(page, cpu);
+ } else if (force_map) {
+ /* Force if, e.g., the normal mapping is migrating. */
+ homecache_finv_map_page(page, home);
+ } else {
+ homecache_finv_page_home(page, home);
+ }
+ sim_validate_lines_evicted(PFN_PHYS(page_to_pfn(page)), PAGE_SIZE);
}
-/* Flush a page out of whatever cache(s) it is in. */
-void homecache_flush_cache(struct page *page, int order)
+void homecache_finv_page(struct page *page)
{
- int pages = 1 << order;
- int length = cache_flush_length(pages * PAGE_SIZE);
- unsigned long pfn = page_to_pfn(page);
- struct cpumask home_mask;
-
- homecache_mask(page, pages, &home_mask);
- flush_remote(pfn, length, &home_mask, 0, 0, 0, NULL, NULL, 0);
- sim_validate_lines_evicted(PFN_PHYS(pfn), pages * PAGE_SIZE);
+ homecache_finv_page_internal(page, 0);
}
+void homecache_evict(const struct cpumask *mask)
+{
+ flush_remote(0, HV_FLUSH_EVICT_L2, mask, 0, 0, 0, NULL, NULL, 0);
+}
/* Report the home corresponding to a given PTE. */
static int pte_to_home(pte_t pte)
@@ -245,10 +256,8 @@ static int pte_to_home(pte_t pte)
return PAGE_HOME_INCOHERENT;
case HV_PTE_MODE_UNCACHED:
return PAGE_HOME_UNCACHED;
-#if CHIP_HAS_CBOX_HOME_MAP()
case HV_PTE_MODE_CACHE_HASH_L3:
return PAGE_HOME_HASH;
-#endif
}
panic("Bad PTE %#llx\n", pte.val);
}
@@ -305,20 +314,16 @@ pte_t pte_set_home(pte_t pte, int home)
HV_PTE_MODE_CACHE_NO_L3);
}
} else
-#if CHIP_HAS_CBOX_HOME_MAP()
if (hash_default)
pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_HASH_L3);
else
-#endif
pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_NO_L3);
pte = hv_pte_set_nc(pte);
break;
-#if CHIP_HAS_CBOX_HOME_MAP()
case PAGE_HOME_HASH:
pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_HASH_L3);
break;
-#endif
default:
BUG_ON(home < 0 || home >= NR_CPUS ||
@@ -328,7 +333,6 @@ pte_t pte_set_home(pte_t pte, int home)
break;
}
-#if CHIP_HAS_NC_AND_NOALLOC_BITS()
if (noallocl2)
pte = hv_pte_set_no_alloc_l2(pte);
@@ -337,7 +341,6 @@ pte_t pte_set_home(pte_t pte, int home)
hv_pte_get_mode(pte) == HV_PTE_MODE_CACHE_NO_L3) {
pte = hv_pte_set_mode(pte, HV_PTE_MODE_UNCACHED);
}
-#endif
/* Checking this case here gives a better panic than from the hv. */
BUG_ON(hv_pte_get_mode(pte) == 0);
@@ -353,21 +356,16 @@ EXPORT_SYMBOL(pte_set_home);
* so they're not suitable for anything but infrequent use.
*/
-#if CHIP_HAS_CBOX_HOME_MAP()
-static inline int initial_page_home(void) { return PAGE_HOME_HASH; }
-#else
-static inline int initial_page_home(void) { return 0; }
-#endif
-
int page_home(struct page *page)
{
if (PageHighMem(page)) {
- return initial_page_home();
+ return PAGE_HOME_HASH;
} else {
unsigned long kva = (unsigned long)page_address(page);
- return pte_to_home(*virt_to_pte(NULL, kva));
+ return pte_to_home(*virt_to_kpte(kva));
}
}
+EXPORT_SYMBOL(page_home);
void homecache_change_page_home(struct page *page, int order, int home)
{
@@ -383,12 +381,13 @@ void homecache_change_page_home(struct page *page, int order, int home)
NULL, 0);
for (i = 0; i < pages; ++i, kva += PAGE_SIZE) {
- pte_t *ptep = virt_to_pte(NULL, kva);
+ pte_t *ptep = virt_to_kpte(kva);
pte_t pteval = *ptep;
BUG_ON(!pte_present(pteval) || pte_huge(pteval));
- *ptep = pte_set_home(pteval, home);
+ __set_pte(ptep, pte_set_home(pteval, home));
}
}
+EXPORT_SYMBOL(homecache_change_page_home);
struct page *homecache_alloc_pages(gfp_t gfp_mask,
unsigned int order, int home)
@@ -413,19 +412,25 @@ struct page *homecache_alloc_pages_node(int nid, gfp_t gfp_mask,
return page;
}
-void homecache_free_pages(unsigned long addr, unsigned int order)
+void __homecache_free_pages(struct page *page, unsigned int order)
{
- struct page *page;
-
- if (addr == 0)
- return;
-
- VM_BUG_ON(!virt_addr_valid((void *)addr));
- page = virt_to_page((void *)addr);
if (put_page_testzero(page)) {
- int pages = (1 << order);
- homecache_change_page_home(page, order, initial_page_home());
- while (pages--)
- __free_page(page++);
+ homecache_change_page_home(page, order, PAGE_HOME_HASH);
+ if (order == 0) {
+ free_hot_cold_page(page, false);
+ } else {
+ init_page_count(page);
+ __free_pages(page, order);
+ }
+ }
+}
+EXPORT_SYMBOL(__homecache_free_pages);
+
+void homecache_free_pages(unsigned long addr, unsigned int order)
+{
+ if (addr != 0) {
+ VM_BUG_ON(!virt_addr_valid((void *)addr));
+ __homecache_free_pages(virt_to_page((void *)addr), order);
}
}
+EXPORT_SYMBOL(homecache_free_pages);
diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c
index 24688b697a8..e514899e110 100644
--- a/arch/tile/mm/hugetlbpage.c
+++ b/arch/tile/mm/hugetlbpage.c
@@ -21,92 +21,135 @@
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
-#include <linux/smp_lock.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <linux/mman.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
+#include <asm/setup.h>
+
+#ifdef CONFIG_HUGETLB_SUPER_PAGES
+
+/*
+ * Provide an additional huge page size (in addition to the regular default
+ * huge page size) if no "hugepagesz" arguments are specified.
+ * Note that it must be smaller than the default huge page size so
+ * that it's possible to allocate them on demand from the buddy allocator.
+ * You can change this to 64K (on a 16K build), 256K, 1M, or 4M,
+ * or not define it at all.
+ */
+#define ADDITIONAL_HUGE_SIZE (1024 * 1024UL)
+
+/* "Extra" page-size multipliers, one per level of the page table. */
+int huge_shift[HUGE_SHIFT_ENTRIES] = {
+#ifdef ADDITIONAL_HUGE_SIZE
+#define ADDITIONAL_HUGE_SHIFT __builtin_ctzl(ADDITIONAL_HUGE_SIZE / PAGE_SIZE)
+ [HUGE_SHIFT_PAGE] = ADDITIONAL_HUGE_SHIFT
+#endif
+};
+
+#endif
pte_t *huge_pte_alloc(struct mm_struct *mm,
unsigned long addr, unsigned long sz)
{
pgd_t *pgd;
pud_t *pud;
- pte_t *pte = NULL;
- /* We do not yet support multiple huge page sizes. */
- BUG_ON(sz != PMD_SIZE);
+ addr &= -sz; /* Mask off any low bits in the address. */
pgd = pgd_offset(mm, addr);
pud = pud_alloc(mm, pgd, addr);
- if (pud)
- pte = (pte_t *) pmd_alloc(mm, pud, addr);
- BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
- return pte;
+#ifdef CONFIG_HUGETLB_SUPER_PAGES
+ if (sz >= PGDIR_SIZE) {
+ BUG_ON(sz != PGDIR_SIZE &&
+ sz != PGDIR_SIZE << huge_shift[HUGE_SHIFT_PGDIR]);
+ return (pte_t *)pud;
+ } else {
+ pmd_t *pmd = pmd_alloc(mm, pud, addr);
+ if (sz >= PMD_SIZE) {
+ BUG_ON(sz != PMD_SIZE &&
+ sz != (PMD_SIZE << huge_shift[HUGE_SHIFT_PMD]));
+ return (pte_t *)pmd;
+ }
+ else {
+ if (sz != PAGE_SIZE << huge_shift[HUGE_SHIFT_PAGE])
+ panic("Unexpected page size %#lx\n", sz);
+ return pte_alloc_map(mm, NULL, pmd, addr);
+ }
+ }
+#else
+ BUG_ON(sz != PMD_SIZE);
+ return (pte_t *) pmd_alloc(mm, pud, addr);
+#endif
}
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+static pte_t *get_pte(pte_t *base, int index, int level)
{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd = NULL;
-
- pgd = pgd_offset(mm, addr);
- if (pgd_present(*pgd)) {
- pud = pud_offset(pgd, addr);
- if (pud_present(*pud))
- pmd = pmd_offset(pud, addr);
+ pte_t *ptep = base + index;
+#ifdef CONFIG_HUGETLB_SUPER_PAGES
+ if (!pte_present(*ptep) && huge_shift[level] != 0) {
+ unsigned long mask = -1UL << huge_shift[level];
+ pte_t *super_ptep = base + (index & mask);
+ pte_t pte = *super_ptep;
+ if (pte_present(pte) && pte_super(pte))
+ ptep = super_ptep;
}
- return (pte_t *) pmd;
+#endif
+ return ptep;
}
-#ifdef HUGETLB_TEST
-struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
- int write)
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
- unsigned long start = address;
- int length = 1;
- int nr;
- struct page *page;
- struct vm_area_struct *vma;
-
- vma = find_vma(mm, addr);
- if (!vma || !is_vm_hugetlb_page(vma))
- return ERR_PTR(-EINVAL);
-
- pte = huge_pte_offset(mm, address);
-
- /* hugetlb should be locked, and hence, prefaulted */
- WARN_ON(!pte || pte_none(*pte));
-
- page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
-
- WARN_ON(!PageHead(page));
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+#ifdef CONFIG_HUGETLB_SUPER_PAGES
+ pte_t *pte;
+#endif
- return page;
-}
+ /* Get the top-level page table entry. */
+ pgd = (pgd_t *)get_pte((pte_t *)mm->pgd, pgd_index(addr), 0);
-int pmd_huge(pmd_t pmd)
-{
- return 0;
-}
+ /* We don't have four levels. */
+ pud = pud_offset(pgd, addr);
+#ifndef __PAGETABLE_PUD_FOLDED
+# error support fourth page table level
+#endif
+ if (!pud_present(*pud))
+ return NULL;
+
+ /* Check for an L0 huge PTE, if we have three levels. */
+#ifndef __PAGETABLE_PMD_FOLDED
+ if (pud_huge(*pud))
+ return (pte_t *)pud;
+
+ pmd = (pmd_t *)get_pte((pte_t *)pud_page_vaddr(*pud),
+ pmd_index(addr), 1);
+ if (!pmd_present(*pmd))
+ return NULL;
+#else
+ pmd = pmd_offset(pud, addr);
+#endif
-int pud_huge(pud_t pud)
-{
- return 0;
-}
+ /* Check for an L1 huge PTE. */
+ if (pmd_huge(*pmd))
+ return (pte_t *)pmd;
+
+#ifdef CONFIG_HUGETLB_SUPER_PAGES
+ /* Check for an L2 huge PTE. */
+ pte = get_pte((pte_t *)pmd_page_vaddr(*pmd), pte_index(addr), 2);
+ if (!pte_present(*pte))
+ return NULL;
+ if (pte_super(*pte))
+ return pte;
+#endif
-struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
- pmd_t *pmd, int write)
-{
return NULL;
}
-#else
-
struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
int write)
{
@@ -150,50 +193,21 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
return 0;
}
-#endif
-
#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
struct hstate *h = hstate_file(file);
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- unsigned long start_addr;
-
- if (len > mm->cached_hole_size) {
- start_addr = mm->free_area_cache;
- } else {
- start_addr = TASK_UNMAPPED_BASE;
- mm->cached_hole_size = 0;
- }
-
-full_search:
- addr = ALIGN(start_addr, huge_page_size(h));
-
- for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
- /* At this point: (!vma || addr < vma->vm_end). */
- if (TASK_SIZE - len < addr) {
- /*
- * Start a new search - just in case we missed
- * some holes.
- */
- if (start_addr != TASK_UNMAPPED_BASE) {
- start_addr = TASK_UNMAPPED_BASE;
- mm->cached_hole_size = 0;
- goto full_search;
- }
- return -ENOMEM;
- }
- if (!vma || addr + len <= vma->vm_start) {
- mm->free_area_cache = addr + len;
- return addr;
- }
- if (addr + mm->cached_hole_size < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
- addr = ALIGN(vma->vm_end, huge_page_size(h));
- }
+ struct vm_unmapped_area_info info;
+
+ info.flags = 0;
+ info.length = len;
+ info.low_limit = TASK_UNMAPPED_BASE;
+ info.high_limit = TASK_SIZE;
+ info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+ info.align_offset = 0;
+ return vm_unmapped_area(&info);
}
static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
@@ -201,92 +215,30 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
unsigned long pgoff, unsigned long flags)
{
struct hstate *h = hstate_file(file);
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma, *prev_vma;
- unsigned long base = mm->mmap_base, addr = addr0;
- unsigned long largest_hole = mm->cached_hole_size;
- int first_time = 1;
-
- /* don't allow allocations above current base */
- if (mm->free_area_cache > base)
- mm->free_area_cache = base;
-
- if (len <= largest_hole) {
- largest_hole = 0;
- mm->free_area_cache = base;
- }
-try_again:
- /* make sure it can fit in the remaining address space */
- if (mm->free_area_cache < len)
- goto fail;
-
- /* either no address requested or cant fit in requested address hole */
- addr = (mm->free_area_cache - len) & huge_page_mask(h);
- do {
- /*
- * Lookup failure means no vma is above this address,
- * i.e. return with success:
- */
- vma = find_vma_prev(mm, addr, &prev_vma);
- if (!vma) {
- return addr;
- break;
- }
+ struct vm_unmapped_area_info info;
+ unsigned long addr;
- /*
- * new region fits between prev_vma->vm_end and
- * vma->vm_start, use it:
- */
- if (addr + len <= vma->vm_start &&
- (!prev_vma || (addr >= prev_vma->vm_end))) {
- /* remember the address as a hint for next time */
- mm->cached_hole_size = largest_hole;
- mm->free_area_cache = addr;
- return addr;
- } else {
- /* pull free_area_cache down to the first hole */
- if (mm->free_area_cache == vma->vm_end) {
- mm->free_area_cache = vma->vm_start;
- mm->cached_hole_size = largest_hole;
- }
- }
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ info.length = len;
+ info.low_limit = PAGE_SIZE;
+ info.high_limit = current->mm->mmap_base;
+ info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+ info.align_offset = 0;
+ addr = vm_unmapped_area(&info);
- /* remember the largest hole we saw so far */
- if (addr + largest_hole < vma->vm_start)
- largest_hole = vma->vm_start - addr;
-
- /* try just below the current vma->vm_start */
- addr = (vma->vm_start - len) & huge_page_mask(h);
-
- } while (len <= vma->vm_start);
-
-fail:
- /*
- * if hint left us with no space for the requested
- * mapping then try again:
- */
- if (first_time) {
- mm->free_area_cache = base;
- largest_hole = 0;
- first_time = 0;
- goto try_again;
- }
/*
* A failed mmap() very likely causes application failure,
* so fall back to the bottom-up function here. This scenario
* can happen with large stack limits and large mmap()
* allocations.
*/
- mm->free_area_cache = TASK_UNMAPPED_BASE;
- mm->cached_hole_size = ~0UL;
- addr = hugetlb_get_unmapped_area_bottomup(file, addr0,
- len, pgoff, flags);
-
- /*
- * Restore the topdown base:
- */
- mm->free_area_cache = base;
- mm->cached_hole_size = ~0UL;
+ if (addr & ~PAGE_MASK) {
+ VM_BUG_ON(addr != -ENOMEM);
+ info.flags = 0;
+ info.low_limit = TASK_UNMAPPED_BASE;
+ info.high_limit = TASK_SIZE;
+ addr = vm_unmapped_area(&info);
+ }
return addr;
}
@@ -323,21 +275,102 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
return hugetlb_get_unmapped_area_topdown(file, addr, len,
pgoff, flags);
}
+#endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */
-static __init int setup_hugepagesz(char *opt)
+#ifdef CONFIG_HUGETLB_SUPER_PAGES
+static __init int __setup_hugepagesz(unsigned long ps)
{
- unsigned long ps = memparse(opt, &opt);
- if (ps == PMD_SIZE) {
- hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
- } else if (ps == PUD_SIZE) {
- hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
+ int log_ps = __builtin_ctzl(ps);
+ int level, base_shift;
+
+ if ((1UL << log_ps) != ps || (log_ps & 1) != 0) {
+ pr_warn("Not enabling %ld byte huge pages;"
+ " must be a power of four.\n", ps);
+ return -EINVAL;
+ }
+
+ if (ps > 64*1024*1024*1024UL) {
+ pr_warn("Not enabling %ld MB huge pages;"
+ " largest legal value is 64 GB .\n", ps >> 20);
+ return -EINVAL;
+ } else if (ps >= PUD_SIZE) {
+ static long hv_jpage_size;
+ if (hv_jpage_size == 0)
+ hv_jpage_size = hv_sysconf(HV_SYSCONF_PAGE_SIZE_JUMBO);
+ if (hv_jpage_size != PUD_SIZE) {
+ pr_warn("Not enabling >= %ld MB huge pages:"
+ " hypervisor reports size %ld\n",
+ PUD_SIZE >> 20, hv_jpage_size);
+ return -EINVAL;
+ }
+ level = 0;
+ base_shift = PUD_SHIFT;
+ } else if (ps >= PMD_SIZE) {
+ level = 1;
+ base_shift = PMD_SHIFT;
+ } else if (ps > PAGE_SIZE) {
+ level = 2;
+ base_shift = PAGE_SHIFT;
} else {
- pr_err("hugepagesz: Unsupported page size %lu M\n",
- ps >> 20);
- return 0;
+ pr_err("hugepagesz: huge page size %ld too small\n", ps);
+ return -EINVAL;
+ }
+
+ if (log_ps != base_shift) {
+ int shift_val = log_ps - base_shift;
+ if (huge_shift[level] != 0) {
+ int old_shift = base_shift + huge_shift[level];
+ pr_warn("Not enabling %ld MB huge pages;"
+ " already have size %ld MB.\n",
+ ps >> 20, (1UL << old_shift) >> 20);
+ return -EINVAL;
+ }
+ if (hv_set_pte_super_shift(level, shift_val) != 0) {
+ pr_warn("Not enabling %ld MB huge pages;"
+ " no hypervisor support.\n", ps >> 20);
+ return -EINVAL;
+ }
+ printk(KERN_DEBUG "Enabled %ld MB huge pages\n", ps >> 20);
+ huge_shift[level] = shift_val;
+ }
+
+ hugetlb_add_hstate(log_ps - PAGE_SHIFT);
+
+ return 0;
+}
+
+static bool saw_hugepagesz;
+
+static __init int setup_hugepagesz(char *opt)
+{
+ if (!saw_hugepagesz) {
+ saw_hugepagesz = true;
+ memset(huge_shift, 0, sizeof(huge_shift));
}
- return 1;
+ return __setup_hugepagesz(memparse(opt, NULL));
}
__setup("hugepagesz=", setup_hugepagesz);
-#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/
+#ifdef ADDITIONAL_HUGE_SIZE
+/*
+ * Provide an additional huge page size if no "hugepagesz" args are given.
+ * In that case, all the cores have properly set up their hv super_shift
+ * already, but we need to notify the hugetlb code to enable the
+ * new huge page size from the Linux point of view.
+ */
+static __init int add_default_hugepagesz(void)
+{
+ if (!saw_hugepagesz) {
+ BUILD_BUG_ON(ADDITIONAL_HUGE_SIZE >= PMD_SIZE ||
+ ADDITIONAL_HUGE_SIZE <= PAGE_SIZE);
+ BUILD_BUG_ON((PAGE_SIZE << ADDITIONAL_HUGE_SHIFT) !=
+ ADDITIONAL_HUGE_SIZE);
+ BUILD_BUG_ON(ADDITIONAL_HUGE_SHIFT & 1);
+ hugetlb_add_hstate(ADDITIONAL_HUGE_SHIFT);
+ }
+ return 0;
+}
+arch_initcall(add_default_hugepagesz);
+#endif
+
+#endif /* CONFIG_HUGETLB_SUPER_PAGES */
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index 0b9ce69b0ee..bfb3127b4df 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -38,7 +38,6 @@
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/processor.h>
-#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
@@ -53,26 +52,13 @@
#include "migrate.h"
-/*
- * We could set FORCE_MAX_ZONEORDER to "(HPAGE_SHIFT - PAGE_SHIFT + 1)"
- * in the Tile Kconfig, but this generates configure warnings.
- * Do it here and force people to get it right to compile this file.
- * The problem is that with 4KB small pages and 16MB huge pages,
- * the default value doesn't allow us to group enough small pages
- * together to make up a huge page.
- */
-#if CONFIG_FORCE_MAX_ZONEORDER < HPAGE_SHIFT - PAGE_SHIFT + 1
-# error "Change FORCE_MAX_ZONEORDER in arch/tile/Kconfig to match page size"
-#endif
-
#define clear_pgd(pmdptr) (*(pmdptr) = hv_pte(0))
#ifndef __tilegx__
unsigned long VMALLOC_RESERVE = CONFIG_VMALLOC_RESERVE;
+EXPORT_SYMBOL(VMALLOC_RESERVE);
#endif
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-
/* Create an L2 page table */
static pte_t * __init alloc_pte(void)
{
@@ -96,7 +82,7 @@ static int num_l2_ptes[MAX_NUMNODES];
static void init_prealloc_ptes(int node, int pages)
{
- BUG_ON(pages & (HV_L2_ENTRIES-1));
+ BUG_ON(pages & (PTRS_PER_PTE - 1));
if (pages) {
num_l2_ptes[node] = pages;
l2_ptes[node] = __alloc_bootmem(pages * sizeof(pte_t),
@@ -120,10 +106,8 @@ pte_t *get_prealloc_pte(unsigned long pfn)
*/
static int initial_heap_home(void)
{
-#if CHIP_HAS_CBOX_HOME_MAP()
if (hash_default)
return PAGE_HOME_HASH;
-#endif
return smp_processor_id();
}
@@ -145,14 +129,9 @@ static void __init assign_pte(pmd_t *pmd, pte_t *page_table)
#ifdef __tilegx__
-#if HV_L1_SIZE != HV_L2_SIZE
-# error Rework assumption that L1 and L2 page tables are same size.
-#endif
-
-/* Since pmd_t arrays and pte_t arrays are the same size, just use casts. */
static inline pmd_t *alloc_pmd(void)
{
- return (pmd_t *)alloc_pte();
+ return __alloc_bootmem(L1_KERNEL_PGTABLE_SIZE, HV_PAGE_TABLE_ALIGN, 0);
}
static inline void assign_pmd(pud_t *pud, pmd_t *pmd)
@@ -169,7 +148,21 @@ void __init shatter_pmd(pmd_t *pmd)
assign_pte(pmd, pte);
}
-#ifdef CONFIG_HIGHMEM
+#ifdef __tilegx__
+static pmd_t *__init get_pmd(pgd_t pgtables[], unsigned long va)
+{
+ pud_t *pud = pud_offset(&pgtables[pgd_index(va)], va);
+ if (pud_none(*pud))
+ assign_pmd(pud, alloc_pmd());
+ return pmd_offset(pud, va);
+}
+#else
+static pmd_t *__init get_pmd(pgd_t pgtables[], unsigned long va)
+{
+ return pmd_offset(pud_offset(&pgtables[pgd_index(va)], va), va);
+}
+#endif
+
/*
* This function initializes a certain range of kernel virtual memory
* with new bootmem page tables, everywhere page tables are missing in
@@ -182,34 +175,24 @@ void __init shatter_pmd(pmd_t *pmd)
* checking the pgd every time.
*/
static void __init page_table_range_init(unsigned long start,
- unsigned long end, pgd_t *pgd_base)
+ unsigned long end, pgd_t *pgd)
{
- pgd_t *pgd;
- int pgd_idx;
unsigned long vaddr;
-
- vaddr = start;
- pgd_idx = pgd_index(vaddr);
- pgd = pgd_base + pgd_idx;
-
- for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
- pmd_t *pmd = pmd_offset(pud_offset(pgd, vaddr), vaddr);
+ start = round_down(start, PMD_SIZE);
+ end = round_up(end, PMD_SIZE);
+ for (vaddr = start; vaddr < end; vaddr += PMD_SIZE) {
+ pmd_t *pmd = get_pmd(pgd, vaddr);
if (pmd_none(*pmd))
assign_pte(pmd, alloc_pte());
- vaddr += PMD_SIZE;
}
}
-#endif /* CONFIG_HIGHMEM */
-
-#if CHIP_HAS_CBOX_HOME_MAP()
static int __initdata ktext_hash = 1; /* .text pages */
static int __initdata kdata_hash = 1; /* .data and .bss pages */
int __write_once hash_default = 1; /* kernel allocator pages */
EXPORT_SYMBOL(hash_default);
int __write_once kstack_hash = 1; /* if no homecaching, use h4h */
-#endif /* CHIP_HAS_CBOX_HOME_MAP */
/*
* CPUs to use to for striping the pages of kernel data. If hash-for-home
@@ -227,14 +210,12 @@ int __write_once kdata_huge; /* if no homecaching, small pages */
static pgprot_t __init construct_pgprot(pgprot_t prot, int home)
{
prot = pte_set_home(prot, home);
-#if CHIP_HAS_CBOX_HOME_MAP()
if (home == PAGE_HOME_IMMUTABLE) {
if (ktext_hash)
prot = hv_pte_set_mode(prot, HV_PTE_MODE_CACHE_HASH_L3);
else
prot = hv_pte_set_mode(prot, HV_PTE_MODE_CACHE_NO_L3);
}
-#endif
return prot;
}
@@ -246,40 +227,28 @@ static pgprot_t __init init_pgprot(ulong address)
{
int cpu;
unsigned long page;
- enum { CODE_DELTA = MEM_SV_INTRPT - PAGE_OFFSET };
+ enum { CODE_DELTA = MEM_SV_START - PAGE_OFFSET };
-#if CHIP_HAS_CBOX_HOME_MAP()
/* For kdata=huge, everything is just hash-for-home. */
if (kdata_huge)
return construct_pgprot(PAGE_KERNEL, PAGE_HOME_HASH);
-#endif
/* We map the aliased pages of permanent text inaccessible. */
if (address < (ulong) _sinittext - CODE_DELTA)
return PAGE_NONE;
- /*
- * We map read-only data non-coherent for performance. We could
- * use neighborhood caching on TILE64, but it's not clear it's a win.
- */
+ /* We map read-only data non-coherent for performance. */
if ((address >= (ulong) __start_rodata &&
address < (ulong) __end_rodata) ||
address == (ulong) empty_zero_page) {
return construct_pgprot(PAGE_KERNEL_RO, PAGE_HOME_IMMUTABLE);
}
- /* As a performance optimization, keep the boot init stack here. */
- if (address >= (ulong)&init_thread_union &&
- address < (ulong)&init_thread_union + THREAD_SIZE)
- return construct_pgprot(PAGE_KERNEL, smp_processor_id());
-
#ifndef __tilegx__
-#if !ATOMIC_LOCKS_FOUND_VIA_TABLE()
/* Force the atomic_locks[] array page to be hash-for-home. */
if (address == (ulong) atomic_locks)
return construct_pgprot(PAGE_KERNEL, PAGE_HOME_HASH);
#endif
-#endif
/*
* Everything else that isn't data or bss is heap, so mark it
@@ -297,28 +266,18 @@ static pgprot_t __init init_pgprot(ulong address)
if (address >= (ulong) _end || address < (ulong) _einitdata)
return construct_pgprot(PAGE_KERNEL, initial_heap_home());
-#if CHIP_HAS_CBOX_HOME_MAP()
/* Use hash-for-home if requested for data/bss. */
if (kdata_hash)
return construct_pgprot(PAGE_KERNEL, PAGE_HOME_HASH);
-#endif
-
- /*
- * Make the w1data homed like heap to start with, to avoid
- * making it part of the page-striped data area when we're just
- * going to convert it to read-only soon anyway.
- */
- if (address >= (ulong)__w1data_begin && address < (ulong)__w1data_end)
- return construct_pgprot(PAGE_KERNEL, initial_heap_home());
/*
* Otherwise we just hand out consecutive cpus. To avoid
* requiring this function to hold state, we just walk forward from
- * _sdata by PAGE_SIZE, skipping the readonly and init data, to reach
- * the requested address, while walking cpu home around kdata_mask.
- * This is typically no more than a dozen or so iterations.
+ * __end_rodata by PAGE_SIZE, skipping the readonly and init data, to
+ * reach the requested address, while walking cpu home around
+ * kdata_mask. This is typically no more than a dozen or so iterations.
*/
- page = (((ulong)__w1data_end) + PAGE_SIZE - 1) & PAGE_MASK;
+ page = (((ulong)__end_rodata) + PAGE_SIZE - 1) & PAGE_MASK;
BUG_ON(address < page || address >= (ulong)_end);
cpu = cpumask_first(&kdata_mask);
for (; page < address; page += PAGE_SIZE) {
@@ -328,11 +287,9 @@ static pgprot_t __init init_pgprot(ulong address)
if (page == (ulong)empty_zero_page)
continue;
#ifndef __tilegx__
-#if !ATOMIC_LOCKS_FOUND_VIA_TABLE()
if (page == (ulong)atomic_locks)
continue;
#endif
-#endif
cpu = cpumask_next(cpu, &kdata_mask);
if (cpu == NR_CPUS)
cpu = cpumask_first(&kdata_mask);
@@ -375,7 +332,7 @@ static int __init setup_ktext(char *str)
ktext_arg_seen = 1;
- /* Default setting on Tile64: use a huge page */
+ /* Default setting: use a huge page */
if (strcmp(str, "huge") == 0)
pr_info("ktext: using one huge locally cached page\n");
@@ -421,31 +378,14 @@ static inline pgprot_t ktext_set_nocache(pgprot_t prot)
{
if (!ktext_nocache)
prot = hv_pte_set_nc(prot);
-#if CHIP_HAS_NC_AND_NOALLOC_BITS()
else
prot = hv_pte_set_no_alloc_l2(prot);
-#endif
return prot;
}
-#ifndef __tilegx__
-static pmd_t *__init get_pmd(pgd_t pgtables[], unsigned long va)
-{
- return pmd_offset(pud_offset(&pgtables[pgd_index(va)], va), va);
-}
-#else
-static pmd_t *__init get_pmd(pgd_t pgtables[], unsigned long va)
-{
- pud_t *pud = pud_offset(&pgtables[pgd_index(va)], va);
- if (pud_none(*pud))
- assign_pmd(pud, alloc_pmd());
- return pmd_offset(pud, va);
-}
-#endif
-
/* Temporary page table we use for staging. */
static pgd_t pgtables[PTRS_PER_PGD]
- __attribute__((section(".init.page")));
+ __attribute__((aligned(HV_PAGE_TABLE_ALIGN)));
/*
* This maps the physical memory to kernel virtual address space, a total
@@ -463,6 +403,7 @@ static pgd_t pgtables[PTRS_PER_PGD]
*/
static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
{
+ unsigned long long irqmask;
unsigned long address, pfn;
pmd_t *pmd;
pte_t *pte;
@@ -471,7 +412,6 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
struct cpumask kstripe_mask;
int rc, i;
-#if CHIP_HAS_CBOX_HOME_MAP()
if (ktext_arg_seen && ktext_hash) {
pr_warning("warning: \"ktext\" boot argument ignored"
" if \"kcache_hash\" sets up text hash-for-home\n");
@@ -488,7 +428,6 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
" kcache_hash=all or =allbutstack\n");
kdata_huge = 0;
}
-#endif
/*
* Set up a mask for cpus to use for kernel striping.
@@ -569,8 +508,9 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
}
}
- address = MEM_SV_INTRPT;
+ address = MEM_SV_START;
pmd = get_pmd(pgtables, address);
+ pfn = 0; /* code starts at PA 0 */
if (ktext_small) {
/* Allocate an L2 PTE for the kernel text */
int cpu = 0;
@@ -592,11 +532,16 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
prot = ktext_set_nocache(prot);
}
- BUG_ON(address != (unsigned long)_stext);
- pfn = 0; /* code starts at PA 0 */
- pte = alloc_pte();
- for (pte_ofs = 0; address < (unsigned long)_einittext;
- pfn++, pte_ofs++, address += PAGE_SIZE) {
+ BUG_ON(address != (unsigned long)_text);
+ pte = NULL;
+ for (; address < (unsigned long)_einittext;
+ pfn++, address += PAGE_SIZE) {
+ pte_ofs = pte_index(address);
+ if (pte_ofs == 0) {
+ if (pte)
+ assign_pte(pmd++, pte);
+ pte = alloc_pte();
+ }
if (!ktext_local) {
prot = set_remote_cache_cpu(prot, cpu);
cpu = cpumask_next(cpu, &ktext_mask);
@@ -605,17 +550,16 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
}
pte[pte_ofs] = pfn_pte(pfn, prot);
}
- assign_pte(pmd, pte);
+ if (pte)
+ assign_pte(pmd, pte);
} else {
pte_t pteval = pfn_pte(0, PAGE_KERNEL_EXEC);
pteval = pte_mkhuge(pteval);
-#if CHIP_HAS_CBOX_HOME_MAP()
if (ktext_hash) {
pteval = hv_pte_set_mode(pteval,
HV_PTE_MODE_CACHE_HASH_L3);
pteval = ktext_set_nocache(pteval);
} else
-#endif /* CHIP_HAS_CBOX_HOME_MAP() */
if (cpumask_weight(&ktext_mask) == 1) {
pteval = set_remote_cache_cpu(pteval,
cpumask_first(&ktext_mask));
@@ -628,7 +572,9 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
else
pteval = hv_pte_set_mode(pteval,
HV_PTE_MODE_CACHE_NO_L3);
- *(pte_t *)pmd = pteval;
+ for (; address < (unsigned long)_einittext;
+ pfn += PFN_DOWN(HPAGE_SIZE), address += HPAGE_SIZE)
+ *(pte_t *)(pmd++) = pfn_pte(pfn, pteval);
}
/* Set swapper_pgprot here so it is flushed to memory right away. */
@@ -643,16 +589,30 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
* - install pgtables[] as the real page table
* - flush the TLB so the new page table takes effect
*/
+ irqmask = interrupt_mask_save_mask();
+ interrupt_mask_set_mask(-1ULL);
rc = flush_and_install_context(__pa(pgtables),
init_pgprot((unsigned long)pgtables),
__get_cpu_var(current_asid),
cpumask_bits(my_cpu_mask));
+ interrupt_mask_restore_mask(irqmask);
BUG_ON(rc != 0);
/* Copy the page table back to the normal swapper_pg_dir. */
memcpy(pgd_base, pgtables, sizeof(pgtables));
__install_page_table(pgd_base, __get_cpu_var(current_asid),
swapper_pgprot);
+
+ /*
+ * We just read swapper_pgprot and thus brought it into the cache,
+ * with its new home & caching mode. When we start the other CPUs,
+ * they're going to reference swapper_pgprot via their initial fake
+ * VA-is-PA mappings, which cache everything locally. At that
+ * time, if it's in our cache with a conflicting home, the
+ * simulator's coherence checker will complain. So, flush it out
+ * of our cache; we're not going to ever use it again anyway.
+ */
+ __insn_finv(&swapper_pgprot);
}
/*
@@ -698,6 +658,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
#endif /* CONFIG_HIGHMEM */
+#ifndef CONFIG_64BIT
static void __init init_free_pfn_range(unsigned long start, unsigned long end)
{
unsigned long pfn;
@@ -727,7 +688,7 @@ static void __init init_free_pfn_range(unsigned long start, unsigned long end)
}
init_page_count(page);
__free_pages(page, order);
- totalram_pages += count;
+ adjust_managed_page_count(page, count);
page += count;
pfn += count;
@@ -740,16 +701,15 @@ static void __init set_non_bootmem_pages_init(void)
for_each_zone(z) {
unsigned long start, end;
int nid = z->zone_pgdat->node_id;
+#ifdef CONFIG_HIGHMEM
int idx = zone_idx(z);
+#endif
start = z->zone_start_pfn;
- if (start == 0)
- continue; /* bootmem */
end = start + z->spanned_pages;
- if (idx == ZONE_NORMAL) {
- BUG_ON(start != node_start_pfn[nid]);
- start = node_free_pfn[nid];
- }
+ start = max(start, node_free_pfn[nid]);
+ start = max(start, max_low_pfn);
+
#ifdef CONFIG_HIGHMEM
if (idx == ZONE_HIGHMEM)
totalhigh_pages += z->spanned_pages;
@@ -770,6 +730,7 @@ static void __init set_non_bootmem_pages_init(void)
init_free_pfn_range(start, end);
}
}
+#endif
/*
* paging_init() sets up the page tables - note that all of lowmem is
@@ -777,9 +738,6 @@ static void __init set_non_bootmem_pages_init(void)
*/
void __init paging_init(void)
{
-#ifdef CONFIG_HIGHMEM
- unsigned long vaddr, end;
-#endif
#ifdef __tilegx__
pud_t *pud;
#endif
@@ -787,14 +745,11 @@ void __init paging_init(void)
kernel_physical_mapping_init(pgd_base);
+ /* Fixed mappings, only the page table structure has to be created. */
+ page_table_range_init(fix_to_virt(__end_of_fixed_addresses - 1),
+ FIXADDR_TOP, pgd_base);
+
#ifdef CONFIG_HIGHMEM
- /*
- * Fixed mappings, only the page table structure has to be
- * created - mappings will be set by set_fixmap():
- */
- vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
- end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
- page_table_range_init(vaddr, end, pgd_base);
permanent_kmaps_init(pgd_base);
#endif
@@ -806,7 +761,7 @@ void __init paging_init(void)
* changing init_mm once we get up and running, and there's no
* need for e.g. vmalloc_sync_all().
*/
- BUILD_BUG_ON(pgd_index(VMALLOC_START) != pgd_index(VMALLOC_END));
+ BUILD_BUG_ON(pgd_index(VMALLOC_START) != pgd_index(VMALLOC_END - 1));
pud = pud_offset(pgd_base + pgd_index(VMALLOC_START), VMALLOC_START);
assign_pmd(pud, alloc_pmd());
#endif
@@ -831,15 +786,13 @@ static void __init set_max_mapnr_init(void)
void __init mem_init(void)
{
- int codesize, datasize, initsize;
int i;
#ifndef __tilegx__
void *last;
#endif
#ifdef CONFIG_FLATMEM
- if (!mem_map)
- BUG();
+ BUG_ON(!mem_map);
#endif
#ifdef CONFIG_HIGHMEM
@@ -857,24 +810,14 @@ void __init mem_init(void)
set_max_mapnr_init();
/* this will put all bootmem onto the freelists */
- totalram_pages += free_all_bootmem();
+ free_all_bootmem();
+#ifndef CONFIG_64BIT
/* count all remaining LOWMEM and give all HIGHMEM to page allocator */
set_non_bootmem_pages_init();
+#endif
- codesize = (unsigned long)&_etext - (unsigned long)&_text;
- datasize = (unsigned long)&_end - (unsigned long)&_sdata;
- initsize = (unsigned long)&_einittext - (unsigned long)&_sinittext;
- initsize += (unsigned long)&_einitdata - (unsigned long)&_sinitdata;
-
- pr_info("Memory: %luk/%luk available (%dk kernel code, %dk data, %dk init, %ldk highmem)\n",
- (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
- num_physpages << (PAGE_SHIFT-10),
- codesize >> 10,
- datasize >> 10,
- initsize >> 10,
- (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
- );
+ mem_init_print_info(NULL);
/*
* In debug mode, dump some interesting memory mappings.
@@ -885,10 +828,6 @@ void __init mem_init(void)
printk(KERN_DEBUG " PKMAP %#lx - %#lx\n",
PKMAP_BASE, PKMAP_ADDR(LAST_PKMAP) - 1);
#endif
-#ifdef CONFIG_HUGEVMAP
- printk(KERN_DEBUG " HUGEMAP %#lx - %#lx\n",
- HUGE_VMAP_BASE, HUGE_VMAP_END - 1);
-#endif
printk(KERN_DEBUG " VMALLOC %#lx - %#lx\n",
_VMALLOC_START, _VMALLOC_END - 1);
#ifdef __tilegx__
@@ -944,41 +883,25 @@ int remove_memory(u64 start, u64 size)
{
return -EINVAL;
}
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+ /* TODO */
+ return -EBUSY;
+}
+#endif
#endif
struct kmem_cache *pgd_cache;
void __init pgtable_cache_init(void)
{
- pgd_cache = kmem_cache_create("pgd",
- PTRS_PER_PGD*sizeof(pgd_t),
- PTRS_PER_PGD*sizeof(pgd_t),
- 0,
- NULL);
+ pgd_cache = kmem_cache_create("pgd", SIZEOF_PGD, SIZEOF_PGD, 0, NULL);
if (!pgd_cache)
panic("pgtable_cache_init(): Cannot create pgd cache");
}
-#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
-/*
- * The __w1data area holds data that is only written during initialization,
- * and is read-only and thus freely cacheable thereafter. Fix the page
- * table entries that cover that region accordingly.
- */
-static void mark_w1data_ro(void)
-{
- /* Loop over page table entries */
- unsigned long addr = (unsigned long)__w1data_begin;
- BUG_ON((addr & (PAGE_SIZE-1)) != 0);
- for (; addr <= (unsigned long)__w1data_end - 1; addr += PAGE_SIZE) {
- unsigned long pfn = kaddr_to_pfn((void *)addr);
- pte_t *ptep = virt_to_pte(NULL, addr);
- BUG_ON(pte_huge(*ptep)); /* not relevant for kdata_huge */
- set_pte_at(&init_mm, addr, ptep, pfn_pte(pfn, PAGE_KERNEL_RO));
- }
-}
-#endif
-
#ifdef CONFIG_DEBUG_PAGEALLOC
static long __write_once initfree;
#else
@@ -989,7 +912,7 @@ static long __write_once initfree = 1;
static int __init set_initfree(char *str)
{
long val;
- if (strict_strtol(str, 0, &val)) {
+ if (kstrtol(str, 0, &val) == 0) {
initfree = val;
pr_info("initfree: %s free init pages\n",
initfree ? "will" : "won't");
@@ -1018,7 +941,7 @@ static void free_init_pages(char *what, unsigned long begin, unsigned long end)
*/
int pfn = kaddr_to_pfn((void *)addr);
struct page *page = pfn_to_page(pfn);
- pte_t *ptep = virt_to_pte(NULL, addr);
+ pte_t *ptep = virt_to_kpte(addr);
if (!initfree) {
/*
* If debugging page accesses then do not free
@@ -1029,31 +952,24 @@ static void free_init_pages(char *what, unsigned long begin, unsigned long end)
pte_clear(&init_mm, addr, ptep);
continue;
}
- __ClearPageReserved(page);
- init_page_count(page);
if (pte_huge(*ptep))
BUG_ON(!kdata_huge);
else
set_pte_at(&init_mm, addr, ptep,
pfn_pte(pfn, PAGE_KERNEL));
memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
- free_page(addr);
- totalram_pages++;
+ free_reserved_page(page);
}
pr_info("Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
}
void free_initmem(void)
{
- const unsigned long text_delta = MEM_SV_INTRPT - PAGE_OFFSET;
+ const unsigned long text_delta = MEM_SV_START - PAGE_OFFSET;
/*
- * Evict the dirty initdata on the boot cpu, evict the w1data
- * wherever it's homed, and evict all the init code everywhere.
- * We are guaranteed that no one will touch the init pages any
- * more, and although other cpus may be touching the w1data,
- * we only actually change the caching on tile64, which won't
- * be keeping local copies in the other tiles' caches anyway.
+ * Evict the cache on all cores to avoid incoherence.
+ * We are guaranteed that no one will touch the init pages any more.
*/
homecache_evict(&cpu_cacheable_map);
@@ -1064,26 +980,11 @@ void free_initmem(void)
/*
* Free the pages mapped from 0xc0000000 that correspond to code
- * pages from MEM_SV_INTRPT that we won't use again after init.
+ * pages from MEM_SV_START that we won't use again after init.
*/
free_init_pages("unused kernel text",
(unsigned long)_sinittext - text_delta,
(unsigned long)_einittext - text_delta);
-
-#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
- /*
- * Upgrade the .w1data section to globally cached.
- * We don't do this on tilepro, since the cache architecture
- * pretty much makes it irrelevant, and in any case we end
- * up having racing issues with other tiles that may touch
- * the data after we flush the cache but before we update
- * the PTEs and flush the TLBs, causing sharer shootdowns
- * later. Even though this is to clean data, it seems like
- * an unnecessary complication.
- */
- mark_w1data_ro();
-#endif
-
/* Do a global TLB flush so everyone sees the changes. */
flush_tlb_all();
}
diff --git a/arch/tile/mm/migrate.h b/arch/tile/mm/migrate.h
index cd45a0837fa..91683d97917 100644
--- a/arch/tile/mm/migrate.h
+++ b/arch/tile/mm/migrate.h
@@ -24,6 +24,9 @@
/*
* This function is used as a helper when setting up the initial
* page table (swapper_pg_dir).
+ *
+ * You must mask ALL interrupts prior to invoking this code, since
+ * you can't legally touch the stack during the cache flush.
*/
extern int flush_and_install_context(HV_PhysAddr page_table, HV_PTE access,
HV_ASID asid,
@@ -39,6 +42,9 @@ extern int flush_and_install_context(HV_PhysAddr page_table, HV_PTE access,
*
* Note that any non-NULL pointers must not point to the page that
* is handled by the stack_pte itself.
+ *
+ * You must mask ALL interrupts prior to invoking this code, since
+ * you can't legally touch the stack during the cache flush.
*/
extern int homecache_migrate_stack_and_flush(pte_t stack_pte, unsigned long va,
size_t length, pte_t *stack_ptep,
diff --git a/arch/tile/mm/migrate_32.S b/arch/tile/mm/migrate_32.S
index f738765cd1e..772085491bf 100644
--- a/arch/tile/mm/migrate_32.S
+++ b/arch/tile/mm/migrate_32.S
@@ -18,6 +18,7 @@
#include <linux/linkage.h>
#include <linux/threads.h>
#include <asm/page.h>
+#include <asm/thread_info.h>
#include <asm/types.h>
#include <asm/asm-offsets.h>
#include <hv/hypervisor.h>
@@ -39,8 +40,7 @@
#define FRAME_R32 16
#define FRAME_R33 20
#define FRAME_R34 24
-#define FRAME_R35 28
-#define FRAME_SIZE 32
+#define FRAME_SIZE 28
@@ -65,12 +65,11 @@
#define r_my_cpumask r5
/* Locals (callee-save); must not be more than FRAME_xxx above. */
-#define r_save_ics r30
-#define r_context_lo r31
-#define r_context_hi r32
-#define r_access_lo r33
-#define r_access_hi r34
-#define r_asid r35
+#define r_context_lo r30
+#define r_context_hi r31
+#define r_access_lo r32
+#define r_access_hi r33
+#define r_asid r34
STD_ENTRY(flush_and_install_context)
/*
@@ -103,11 +102,7 @@ STD_ENTRY(flush_and_install_context)
sw r_tmp, r33
addi r_tmp, sp, FRAME_R34
}
- {
- sw r_tmp, r34
- addi r_tmp, sp, FRAME_R35
- }
- sw r_tmp, r35
+ sw r_tmp, r34
/* Move some arguments to callee-save registers. */
{
@@ -120,13 +115,6 @@ STD_ENTRY(flush_and_install_context)
}
move r_asid, r_asid_in
- /* Disable interrupts, since we can't use our stack. */
- {
- mfspr r_save_ics, INTERRUPT_CRITICAL_SECTION
- movei r_tmp, 1
- }
- mtspr INTERRUPT_CRITICAL_SECTION, r_tmp
-
/* First, flush our L2 cache. */
{
move r0, zero /* cache_pa */
@@ -148,7 +136,7 @@ STD_ENTRY(flush_and_install_context)
move r8, zero /* asids */
move r9, zero /* asidcount */
}
- jal hv_flush_remote
+ jal _hv_flush_remote
bnz r0, .Ldone
/* Now install the new page table. */
@@ -162,9 +150,9 @@ STD_ENTRY(flush_and_install_context)
}
{
move r4, r_asid
- movei r5, HV_CTX_DIRECTIO
+ moveli r5, HV_CTX_DIRECTIO | CTX_PAGE_FLAG
}
- jal hv_install_context
+ jal _hv_install_context
bnz r0, .Ldone
/* Finally, flush the TLB. */
@@ -174,9 +162,6 @@ STD_ENTRY(flush_and_install_context)
}
.Ldone:
- /* Reset interrupts back how they were before. */
- mtspr INTERRUPT_CRITICAL_SECTION, r_save_ics
-
/* Restore the callee-saved registers and return. */
addli lr, sp, FRAME_SIZE
{
@@ -201,10 +186,6 @@ STD_ENTRY(flush_and_install_context)
}
{
lw r34, r_tmp
- addli r_tmp, sp, FRAME_R35
- }
- {
- lw r35, r_tmp
addi sp, sp, FRAME_SIZE
}
jrp lr
diff --git a/arch/tile/mm/migrate_64.S b/arch/tile/mm/migrate_64.S
new file mode 100644
index 00000000000..a49eee38f87
--- /dev/null
+++ b/arch/tile/mm/migrate_64.S
@@ -0,0 +1,167 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ *
+ * This routine is a helper for migrating the home of a set of pages to
+ * a new cpu. See the documentation in homecache.c for more information.
+ */
+
+#include <linux/linkage.h>
+#include <linux/threads.h>
+#include <asm/page.h>
+#include <asm/thread_info.h>
+#include <asm/types.h>
+#include <asm/asm-offsets.h>
+#include <hv/hypervisor.h>
+
+ .text
+
+/*
+ * First, some definitions that apply to all the code in the file.
+ */
+
+/* Locals (caller-save) */
+#define r_tmp r10
+#define r_save_sp r11
+
+/* What we save where in the stack frame; must include all callee-saves. */
+#define FRAME_SP 8
+#define FRAME_R30 16
+#define FRAME_R31 24
+#define FRAME_R32 32
+#define FRAME_SIZE 40
+
+
+
+
+/*
+ * On entry:
+ *
+ * r0 the new context PA to install (moved to r_context)
+ * r1 PTE to use for context access (moved to r_access)
+ * r2 ASID to use for new context (moved to r_asid)
+ * r3 pointer to cpumask with just this cpu set in it (r_my_cpumask)
+ */
+
+/* Arguments (caller-save) */
+#define r_context_in r0
+#define r_access_in r1
+#define r_asid_in r2
+#define r_my_cpumask r3
+
+/* Locals (callee-save); must not be more than FRAME_xxx above. */
+#define r_context r30
+#define r_access r31
+#define r_asid r32
+
+/*
+ * Caller-save locals and frame constants are the same as
+ * for homecache_migrate_stack_and_flush.
+ */
+
+STD_ENTRY(flush_and_install_context)
+ /*
+ * Create a stack frame; we can't touch it once we flush the
+ * cache until we install the new page table and flush the TLB.
+ */
+ {
+ move r_save_sp, sp
+ st sp, lr
+ addi sp, sp, -FRAME_SIZE
+ }
+ addi r_tmp, sp, FRAME_SP
+ {
+ st r_tmp, r_save_sp
+ addi r_tmp, sp, FRAME_R30
+ }
+ {
+ st r_tmp, r30
+ addi r_tmp, sp, FRAME_R31
+ }
+ {
+ st r_tmp, r31
+ addi r_tmp, sp, FRAME_R32
+ }
+ st r_tmp, r32
+
+ /* Move some arguments to callee-save registers. */
+ {
+ move r_context, r_context_in
+ move r_access, r_access_in
+ }
+ move r_asid, r_asid_in
+
+ /* First, flush our L2 cache. */
+ {
+ move r0, zero /* cache_pa */
+ moveli r1, hw2_last(HV_FLUSH_EVICT_L2) /* cache_control */
+ }
+ {
+ shl16insli r1, r1, hw1(HV_FLUSH_EVICT_L2)
+ move r2, r_my_cpumask /* cache_cpumask */
+ }
+ {
+ shl16insli r1, r1, hw0(HV_FLUSH_EVICT_L2)
+ move r3, zero /* tlb_va */
+ }
+ {
+ move r4, zero /* tlb_length */
+ move r5, zero /* tlb_pgsize */
+ }
+ {
+ move r6, zero /* tlb_cpumask */
+ move r7, zero /* asids */
+ }
+ {
+ move r8, zero /* asidcount */
+ jal _hv_flush_remote
+ }
+ bnez r0, 1f
+
+ /* Now install the new page table. */
+ {
+ move r0, r_context
+ move r1, r_access
+ }
+ {
+ move r2, r_asid
+ moveli r3, HV_CTX_DIRECTIO | CTX_PAGE_FLAG
+ }
+ jal _hv_install_context
+ bnez r0, 1f
+
+ /* Finally, flush the TLB. */
+ {
+ movei r0, 0 /* preserve_global */
+ jal hv_flush_all
+ }
+
+1: /* Restore the callee-saved registers and return. */
+ addli lr, sp, FRAME_SIZE
+ {
+ ld lr, lr
+ addli r_tmp, sp, FRAME_R30
+ }
+ {
+ ld r30, r_tmp
+ addli r_tmp, sp, FRAME_R31
+ }
+ {
+ ld r31, r_tmp
+ addli r_tmp, sp, FRAME_R32
+ }
+ {
+ ld r32, r_tmp
+ addi sp, sp, FRAME_SIZE
+ }
+ jrp lr
+ STD_ENDPROC(flush_and_install_context)
diff --git a/arch/tile/mm/mmap.c b/arch/tile/mm/mmap.c
index f96f4cec602..851a94e6ae5 100644
--- a/arch/tile/mm/mmap.c
+++ b/arch/tile/mm/mmap.c
@@ -58,18 +58,36 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
#else
int is_32bit = 0;
#endif
+ unsigned long random_factor = 0UL;
+
+ /*
+ * 8 bits of randomness in 32bit mmaps, 24 address space bits
+ * 12 bits of randomness in 64bit mmaps, 28 address space bits
+ */
+ if (current->flags & PF_RANDOMIZE) {
+ if (is_32bit)
+ random_factor = get_random_int() % (1<<8);
+ else
+ random_factor = get_random_int() % (1<<12);
+
+ random_factor <<= PAGE_SHIFT;
+ }
/*
* Use standard layout if the expected stack growth is unlimited
* or we are running native 64 bits.
*/
- if (!is_32bit || rlimit(RLIMIT_STACK) == RLIM_INFINITY) {
- mm->mmap_base = TASK_UNMAPPED_BASE;
+ if (rlimit(RLIMIT_STACK) == RLIM_INFINITY) {
+ mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
mm->get_unmapped_area = arch_get_unmapped_area;
- mm->unmap_area = arch_unmap_area;
} else {
mm->mmap_base = mmap_base(mm);
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
- mm->unmap_area = arch_unmap_area_topdown;
}
}
+
+unsigned long arch_randomize_brk(struct mm_struct *mm)
+{
+ unsigned long range_end = mm->brk + 0x02000000;
+ return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
+}
diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
index 1f5430c53d0..5e86eac4bfa 100644
--- a/arch/tile/mm/pgtable.c
+++ b/arch/tile/mm/pgtable.c
@@ -27,7 +27,6 @@
#include <linux/vmalloc.h>
#include <linux/smp.h>
-#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
@@ -41,7 +40,7 @@
* The normal show_free_areas() is too verbose on Tile, with dozens
* of processors and often four NUMA zones each with high and lowmem.
*/
-void show_mem(void)
+void show_mem(unsigned int filter)
{
struct zone *zone;
@@ -62,7 +61,7 @@ void show_mem(void)
global_page_state(NR_PAGETABLE),
global_page_state(NR_BOUNCE),
global_page_state(NR_FILE_PAGES),
- nr_swap_pages);
+ get_nr_swap_pages());
for_each_zone(zone) {
unsigned long flags, order, total = 0, largest_order = -1;
@@ -84,63 +83,72 @@ void show_mem(void)
}
}
-/*
- * Associate a virtual page frame with a given physical page frame
- * and protection flags for that frame.
+/**
+ * shatter_huge_page() - ensure a given address is mapped by a small page.
+ *
+ * This function converts a huge PTE mapping kernel LOWMEM into a bunch
+ * of small PTEs with the same caching. No cache flush required, but we
+ * must do a global TLB flush.
+ *
+ * Any caller that wishes to modify a kernel mapping that might
+ * have been made with a huge page should call this function,
+ * since doing so properly avoids race conditions with installing the
+ * newly-shattered page and then flushing all the TLB entries.
+ *
+ * @addr: Address at which to shatter any existing huge page.
*/
-static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
+void shatter_huge_page(unsigned long addr)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
- pte_t *pte;
+ unsigned long flags = 0; /* happy compiler */
+#ifdef __PAGETABLE_PMD_FOLDED
+ struct list_head *pos;
+#endif
- pgd = swapper_pg_dir + pgd_index(vaddr);
- if (pgd_none(*pgd)) {
- BUG();
- return;
- }
- pud = pud_offset(pgd, vaddr);
- if (pud_none(*pud)) {
- BUG();
+ /* Get a pointer to the pmd entry that we need to change. */
+ addr &= HPAGE_MASK;
+ BUG_ON(pgd_addr_invalid(addr));
+ BUG_ON(addr < PAGE_OFFSET); /* only for kernel LOWMEM */
+ pgd = swapper_pg_dir + pgd_index(addr);
+ pud = pud_offset(pgd, addr);
+ BUG_ON(!pud_present(*pud));
+ pmd = pmd_offset(pud, addr);
+ BUG_ON(!pmd_present(*pmd));
+ if (!pmd_huge_page(*pmd))
return;
- }
- pmd = pmd_offset(pud, vaddr);
- if (pmd_none(*pmd)) {
- BUG();
+
+ spin_lock_irqsave(&init_mm.page_table_lock, flags);
+ if (!pmd_huge_page(*pmd)) {
+ /* Lost the race to convert the huge page. */
+ spin_unlock_irqrestore(&init_mm.page_table_lock, flags);
return;
}
- pte = pte_offset_kernel(pmd, vaddr);
- /* <pfn,flags> stored as-is, to permit clearing entries */
- set_pte(pte, pfn_pte(pfn, flags));
-
- /*
- * It's enough to flush this one mapping.
- * This appears conservative since it is only called
- * from __set_fixmap.
- */
- local_flush_tlb_page(NULL, vaddr, PAGE_SIZE);
-}
-void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
-{
- unsigned long address = __fix_to_virt(idx);
-
- if (idx >= __end_of_fixed_addresses) {
- BUG();
- return;
+ /* Shatter the huge page into the preallocated L2 page table. */
+ pmd_populate_kernel(&init_mm, pmd, get_prealloc_pte(pmd_pfn(*pmd)));
+
+#ifdef __PAGETABLE_PMD_FOLDED
+ /* Walk every pgd on the system and update the pmd there. */
+ spin_lock(&pgd_lock);
+ list_for_each(pos, &pgd_list) {
+ pmd_t *copy_pmd;
+ pgd = list_to_pgd(pos) + pgd_index(addr);
+ pud = pud_offset(pgd, addr);
+ copy_pmd = pmd_offset(pud, addr);
+ __set_pmd(copy_pmd, *pmd);
}
- set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
-}
+ spin_unlock(&pgd_lock);
+#endif
-#if defined(CONFIG_HIGHPTE)
-pte_t *_pte_offset_map(pmd_t *dir, unsigned long address)
-{
- pte_t *pte = kmap_atomic(pmd_page(*dir)) +
- (pmd_ptfn(*dir) << HV_LOG2_PAGE_TABLE_ALIGN) & ~PAGE_MASK;
- return &pte[pte_index(address)];
+ /* Tell every cpu to notice the change. */
+ flush_remote(0, 0, NULL, addr, HPAGE_SIZE, HPAGE_SIZE,
+ cpu_possible_mask, NULL, 0);
+
+ /* Hold the lock until the TLB flush is finished to avoid races. */
+ spin_unlock_irqrestore(&init_mm.page_table_lock, flags);
}
-#endif
/*
* List of all pgd's needed so it can invalidate entries in both cached
@@ -148,9 +156,13 @@ pte_t *_pte_offset_map(pmd_t *dir, unsigned long address)
* against pageattr.c; it is the unique case in which a valid change
* of kernel pagetables can't be lazily synchronized by vmalloc faults.
* vmalloc faults work because attached pagetables are never freed.
- * The locking scheme was chosen on the basis of manfred's
- * recommendations and having no core impact whatsoever.
- * -- wli
+ *
+ * The lock is always taken with interrupts disabled, unlike on x86
+ * and other platforms, because we need to take the lock in
+ * shatter_huge_page(), which may be called from an interrupt context.
+ * We are not at risk from the tlbflush IPI deadlock that was seen on
+ * x86, since we use the flush_remote() API to have the hypervisor do
+ * the TLB flushes regardless of irq disabling.
*/
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);
@@ -184,9 +196,9 @@ static void pgd_ctor(pgd_t *pgd)
BUG_ON(((u64 *)swapper_pg_dir)[pgd_index(MEM_USER_INTRPT)] != 0);
#endif
- clone_pgd_range(pgd + KERNEL_PGD_INDEX_START,
- swapper_pg_dir + KERNEL_PGD_INDEX_START,
- KERNEL_PGD_PTRS);
+ memcpy(pgd + KERNEL_PGD_INDEX_START,
+ swapper_pg_dir + KERNEL_PGD_INDEX_START,
+ KERNEL_PGD_PTRS * sizeof(pgd_t));
pgd_list_add(pgd);
spin_unlock_irqrestore(&pgd_lock, flags);
@@ -218,20 +230,32 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
#define L2_USER_PGTABLE_PAGES (1 << L2_USER_PGTABLE_ORDER)
-struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
+struct page *pgtable_alloc_one(struct mm_struct *mm, unsigned long address,
+ int order)
{
- gfp_t flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO|__GFP_COMP;
+ gfp_t flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO;
struct page *p;
-
-#ifdef CONFIG_HIGHPTE
- flags |= __GFP_HIGHMEM;
-#endif
+ int i;
p = alloc_pages(flags, L2_USER_PGTABLE_ORDER);
if (p == NULL)
return NULL;
- pgtable_page_ctor(p);
+ if (!pgtable_page_ctor(p)) {
+ __free_pages(p, L2_USER_PGTABLE_ORDER);
+ return NULL;
+ }
+
+ /*
+ * Make every page have a page_count() of one, not just the first.
+ * We don't use __GFP_COMP since it doesn't look like it works
+ * correctly with tlb_remove_page().
+ */
+ for (i = 1; i < order; ++i) {
+ init_page_count(p+i);
+ inc_zone_page_state(p+i, NR_PAGETABLE);
+ }
+
return p;
}
@@ -240,30 +264,30 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
* process). We have to correct whatever pte_alloc_one() did before
* returning the pages to the allocator.
*/
-void pte_free(struct mm_struct *mm, struct page *p)
+void pgtable_free(struct mm_struct *mm, struct page *p, int order)
{
+ int i;
+
pgtable_page_dtor(p);
- __free_pages(p, L2_USER_PGTABLE_ORDER);
+ __free_page(p);
+
+ for (i = 1; i < order; ++i) {
+ __free_page(p+i);
+ dec_zone_page_state(p+i, NR_PAGETABLE);
+ }
}
-void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
- unsigned long address)
+void __pgtable_free_tlb(struct mmu_gather *tlb, struct page *pte,
+ unsigned long address, int order)
{
int i;
pgtable_page_dtor(pte);
- tlb->need_flush = 1;
- if (tlb_fast_mode(tlb)) {
- struct page *pte_pages[L2_USER_PGTABLE_PAGES];
- for (i = 0; i < L2_USER_PGTABLE_PAGES; ++i)
- pte_pages[i] = pte + i;
- free_pages_and_swap_cache(pte_pages, L2_USER_PGTABLE_PAGES);
- return;
- }
- for (i = 0; i < L2_USER_PGTABLE_PAGES; ++i) {
- tlb->pages[tlb->nr++] = pte + i;
- if (tlb->nr >= FREE_PTE_NR)
- tlb_flush_mmu(tlb, 0, 0);
+ tlb_remove_page(tlb, pte);
+
+ for (i = 1; i < order; ++i) {
+ tlb_remove_page(tlb, pte + i);
+ dec_zone_page_state(pte + i, NR_PAGETABLE);
}
}
@@ -304,6 +328,17 @@ void ptep_set_wrprotect(struct mm_struct *mm,
#endif
+/*
+ * Return a pointer to the PTE that corresponds to the given
+ * address in the given page table. A NULL page table just uses
+ * the standard kernel page table; the preferred API in this case
+ * is virt_to_kpte().
+ *
+ * The returned pointer can point to a huge page in other levels
+ * of the page table than the bottom, if the huge page is present
+ * in the page table. For bottom-level PTEs, the returned pointer
+ * can point to a PTE that is either present or not.
+ */
pte_t *virt_to_pte(struct mm_struct* mm, unsigned long addr)
{
pgd_t *pgd;
@@ -317,13 +352,23 @@ pte_t *virt_to_pte(struct mm_struct* mm, unsigned long addr)
pud = pud_offset(pgd, addr);
if (!pud_present(*pud))
return NULL;
+ if (pud_huge_page(*pud))
+ return (pte_t *)pud;
pmd = pmd_offset(pud, addr);
- if (pmd_huge_page(*pmd))
- return (pte_t *)pmd;
if (!pmd_present(*pmd))
return NULL;
+ if (pmd_huge_page(*pmd))
+ return (pte_t *)pmd;
return pte_offset_kernel(pmd, addr);
}
+EXPORT_SYMBOL(virt_to_pte);
+
+pte_t *virt_to_kpte(unsigned long kaddr)
+{
+ BUG_ON(kaddr < PAGE_OFFSET);
+ return virt_to_pte(NULL, kaddr);
+}
+EXPORT_SYMBOL(virt_to_kpte);
pgprot_t set_remote_cache_cpu(pgprot_t prot, int cpu)
{
@@ -346,41 +391,65 @@ int get_remote_cache_cpu(pgprot_t prot)
return x + y * smp_width;
}
-void set_pte_order(pte_t *ptep, pte_t pte, int order)
+/*
+ * Convert a kernel VA to a PA and homing information.
+ */
+int va_to_cpa_and_pte(void *va, unsigned long long *cpa, pte_t *pte)
{
- unsigned long pfn = pte_pfn(pte);
- struct page *page = pfn_to_page(pfn);
+ struct page *page = virt_to_page(va);
+ pte_t null_pte = { 0 };
+
+ *cpa = __pa(va);
- /* Update the home of a PTE if necessary */
- pte = pte_set_home(pte, page_home(page));
+ /* Note that this is not writing a page table, just returning a pte. */
+ *pte = pte_set_home(null_pte, page_home(page));
+
+ return 0; /* return non-zero if not hfh? */
+}
+EXPORT_SYMBOL(va_to_cpa_and_pte);
+void __set_pte(pte_t *ptep, pte_t pte)
+{
#ifdef __tilegx__
*ptep = pte;
#else
- /*
- * When setting a PTE, write the high bits first, then write
- * the low bits. This sets the "present" bit only after the
- * other bits are in place. If a particular PTE update
- * involves transitioning from one valid PTE to another, it
- * may be necessary to call set_pte_order() more than once,
- * transitioning via a suitable intermediate state.
- * Note that this sequence also means that if we are transitioning
- * from any migrating PTE to a non-migrating one, we will not
- * see a half-updated PTE with the migrating bit off.
- */
-#if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32
-# error Must write the present and migrating bits last
-#endif
- ((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
- barrier();
- ((u32 *)ptep)[0] = (u32)(pte_val(pte));
-#endif
+# if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32
+# error Must write the present and migrating bits last
+# endif
+ if (pte_present(pte)) {
+ ((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
+ barrier();
+ ((u32 *)ptep)[0] = (u32)(pte_val(pte));
+ } else {
+ ((u32 *)ptep)[0] = (u32)(pte_val(pte));
+ barrier();
+ ((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
+ }
+#endif /* __tilegx__ */
+}
+
+void set_pte(pte_t *ptep, pte_t pte)
+{
+ if (pte_present(pte) &&
+ (!CHIP_HAS_MMIO() || hv_pte_get_mode(pte) != HV_PTE_MODE_MMIO)) {
+ /* The PTE actually references physical memory. */
+ unsigned long pfn = pte_pfn(pte);
+ if (pfn_valid(pfn)) {
+ /* Update the home of the PTE from the struct page. */
+ pte = pte_set_home(pte, page_home(pfn_to_page(pfn)));
+ } else if (hv_pte_get_mode(pte) == 0) {
+ /* remap_pfn_range(), etc, must supply PTE mode. */
+ panic("set_pte(): out-of-range PFN and mode 0\n");
+ }
+ }
+
+ __set_pte(ptep, pte);
}
/* Can this mm load a PTE with cached_priority set? */
static inline int mm_is_priority_cached(struct mm_struct *mm)
{
- return mm->context.priority_cached;
+ return mm->context.priority_cached != 0;
}
/*
@@ -390,8 +459,8 @@ static inline int mm_is_priority_cached(struct mm_struct *mm)
void start_mm_caching(struct mm_struct *mm)
{
if (!mm_is_priority_cached(mm)) {
- mm->context.priority_cached = -1U;
- hv_set_caching(-1U);
+ mm->context.priority_cached = -1UL;
+ hv_set_caching(-1UL);
}
}
@@ -406,7 +475,7 @@ void start_mm_caching(struct mm_struct *mm)
* Presumably we'll come back later and have more luck and clear
* the value then; for now we'll just keep the cache marked for priority.
*/
-static unsigned int update_priority_cached(struct mm_struct *mm)
+static unsigned long update_priority_cached(struct mm_struct *mm)
{
if (mm->context.priority_cached && down_write_trylock(&mm->mmap_sem)) {
struct vm_area_struct *vm;
@@ -474,20 +543,13 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
addr = area->addr;
if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
phys_addr, pgprot)) {
- remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
+ free_vm_area(area);
return NULL;
}
return (__force void __iomem *) (offset + (char *)addr);
}
EXPORT_SYMBOL(ioremap_prot);
-/* Map a PCI MMIO bus address into VA space. */
-void __iomem *ioremap(resource_size_t phys_addr, unsigned long size)
-{
- panic("ioremap for PCI MMIO is not supported");
-}
-EXPORT_SYMBOL(ioremap);
-
/* Unmap an MMIO VA mapping. */
void iounmap(volatile void __iomem *addr_in)
{
@@ -505,12 +567,7 @@ void iounmap(volatile void __iomem *addr_in)
in parallel. Reuse of the virtual address is prevented by
leaving it in the global lists until we're done with it.
cpa takes care of the direct mappings. */
- read_lock(&vmlist_lock);
- for (p = vmlist; p; p = p->next) {
- if (p->addr == addr)
- break;
- }
- read_unlock(&vmlist_lock);
+ p = find_vm_area((void *)addr);
if (!p) {
pr_err("iounmap: bad address %p\n", addr);