Diffstat (limited to 'arch/tile/mm')
| Mode | File | Lines changed |
| --- | --- | --- |
| -rw-r--r-- | arch/tile/mm/elf.c | 114 |
| -rw-r--r-- | arch/tile/mm/fault.c | 256 |
| -rw-r--r-- | arch/tile/mm/highmem.c | 8 |
| -rw-r--r-- | arch/tile/mm/homecache.c | 185 |
| -rw-r--r-- | arch/tile/mm/hugetlbpage.c | 397 |
| -rw-r--r-- | arch/tile/mm/init.c | 293 |
| -rw-r--r-- | arch/tile/mm/migrate.h | 6 |
| -rw-r--r-- | arch/tile/mm/migrate_32.S | 41 |
| -rw-r--r-- | arch/tile/mm/migrate_64.S | 167 |
| -rw-r--r-- | arch/tile/mm/mmap.c | 26 |
| -rw-r--r-- | arch/tile/mm/pgtable.c | 295 |
11 files changed, 994 insertions, 794 deletions
diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c index 55e58e93bfc..23f044e8a7a 100644 --- a/arch/tile/mm/elf.c +++ b/arch/tile/mm/elf.c @@ -21,6 +21,8 @@  #include <asm/pgtable.h>  #include <asm/pgalloc.h>  #include <asm/sections.h> +#include <asm/vdso.h> +#include <arch/sim.h>  /* Notify a running simulator, if any, that an exec just occurred. */  static void sim_notify_exec(const char *binary_name) @@ -35,28 +37,57 @@ static void sim_notify_exec(const char *binary_name)  	} while (c);  } -static int notify_exec(void) +static int notify_exec(struct mm_struct *mm)  { -	int retval = 0;  /* failure */ -	struct vm_area_struct *vma = current->mm->mmap; -	while (vma) { -		if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file) +	char *buf, *path; +	struct vm_area_struct *vma; + +	if (!sim_is_simulator()) +		return 1; + +	if (mm->exe_file == NULL) +		return 0; + +	for (vma = current->mm->mmap; ; vma = vma->vm_next) { +		if (vma == NULL) +			return 0; +		if (vma->vm_file == mm->exe_file)  			break; -		vma = vma->vm_next;  	} -	if (vma) { -		char *buf = (char *) __get_free_page(GFP_KERNEL); -		if (buf) { -			char *path = d_path(&vma->vm_file->f_path, -					    buf, PAGE_SIZE); -			if (!IS_ERR(path)) { -				sim_notify_exec(path); -				retval = 1; -			} -			free_page((unsigned long)buf); + +	buf = (char *) __get_free_page(GFP_KERNEL); +	if (buf == NULL) +		return 0; + +	path = d_path(&mm->exe_file->f_path, buf, PAGE_SIZE); +	if (IS_ERR(path)) { +		free_page((unsigned long)buf); +		return 0; +	} + +	/* +	 * Notify simulator of an ET_DYN object so we know the load address. +	 * The somewhat cryptic overuse of SIM_CONTROL_DLOPEN allows us +	 * to be backward-compatible with older simulator releases. +	 */ +	if (vma->vm_start == (ELF_ET_DYN_BASE & PAGE_MASK)) { +		char buf[64]; +		int i; + +		snprintf(buf, sizeof(buf), "0x%lx:@", vma->vm_start); +		for (i = 0; ; ++i) { +			char c = buf[i]; +			__insn_mtspr(SPR_SIM_CONTROL, +				     (SIM_CONTROL_DLOPEN +				      | (c << _SIM_CONTROL_OPERATOR_BITS))); +			if (c == '\0') +				break;  		}  	} -	return retval; + +	sim_notify_exec(path); +	free_page((unsigned long)buf); +	return 1;  }  /* Notify a running simulator, if any, that we loaded an interpreter. */ @@ -72,63 +103,23 @@ static void sim_notify_interp(unsigned long load_addr)  } -/* Kernel address of page used to map read-only kernel data into userspace. */ -static void *vdso_page; - -/* One-entry array used for install_special_mapping. */ -static struct page *vdso_pages[1]; - -static int __init vdso_setup(void) -{ -	vdso_page = (void *)get_zeroed_page(GFP_ATOMIC); -	memcpy(vdso_page, __rt_sigreturn, __rt_sigreturn_end - __rt_sigreturn); -	vdso_pages[0] = virt_to_page(vdso_page); -	return 0; -} -device_initcall(vdso_setup); - -const char *arch_vma_name(struct vm_area_struct *vma) -{ -	if (vma->vm_private_data == vdso_pages) -		return "[vdso]"; -#ifndef __tilegx__ -	if (vma->vm_start == MEM_USER_INTRPT) -		return "[intrpt]"; -#endif -	return NULL; -} -  int arch_setup_additional_pages(struct linux_binprm *bprm,  				int executable_stack)  {  	struct mm_struct *mm = current->mm; -	unsigned long vdso_base;  	int retval = 0; +	down_write(&mm->mmap_sem); +  	/*  	 * Notify the simulator that an exec just occurred.  	 * If we can't find the filename of the mapping, just use  	 * whatever was passed as the linux_binprm filename.  	 
*/ -	if (!notify_exec()) +	if (!notify_exec(mm))  		sim_notify_exec(bprm->filename); -	down_write(&mm->mmap_sem); - -	/* -	 * MAYWRITE to allow gdb to COW and set breakpoints -	 * -	 * Make sure the vDSO gets into every core dump.  Dumping its -	 * contents makes post-mortem fully interpretable later -	 * without matching up the same kernel and hardware config to -	 * see what PC values meant. -	 */ -	vdso_base = VDSO_BASE; -	retval = install_special_mapping(mm, vdso_base, PAGE_SIZE, -					 VM_READ|VM_EXEC| -					 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| -					 VM_ALWAYSDUMP, -					 vdso_pages); +	retval = setup_vdso_pages();  #ifndef __tilegx__  	/* @@ -140,7 +131,6 @@ int arch_setup_additional_pages(struct linux_binprm *bprm,  	if (!retval) {  		unsigned long addr = MEM_USER_INTRPT;  		addr = mmap_region(NULL, addr, INTRPT_SIZE, -				   MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE,  				   VM_READ|VM_EXEC|  				   VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0);  		if (addr > (unsigned long) -PAGE_SIZE) diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c index f295b4ac941..6c0571216a9 100644 --- a/arch/tile/mm/fault.c +++ b/arch/tile/mm/fault.c @@ -24,7 +24,6 @@  #include <linux/mman.h>  #include <linux/mm.h>  #include <linux/smp.h> -#include <linux/smp_lock.h>  #include <linux/interrupt.h>  #include <linux/init.h>  #include <linux/tty.h> @@ -35,8 +34,8 @@  #include <linux/hugetlb.h>  #include <linux/syscalls.h>  #include <linux/uaccess.h> +#include <linux/kdebug.h> -#include <asm/system.h>  #include <asm/pgalloc.h>  #include <asm/sections.h>  #include <asm/traps.h> @@ -44,15 +43,18 @@  #include <arch/interrupts.h> -static noinline void force_sig_info_fault(int si_signo, int si_code, -	unsigned long address, int fault_num, struct task_struct *tsk) +static noinline void force_sig_info_fault(const char *type, int si_signo, +					  int si_code, unsigned long address, +					  int fault_num, +					  struct task_struct *tsk, +					  struct pt_regs *regs)  {  	siginfo_t info;  	if (unlikely(tsk->pid < 2)) {  		panic("Signal %d (code %d) at %#lx sent to %s!",  		      si_signo, si_code & 0xffff, address, -		      tsk->pid ? "init" : "the idle task"); +		      is_idle_task(tsk) ? "the idle task" : "init");  	}  	info.si_signo = si_signo; @@ -60,6 +62,7 @@ static noinline void force_sig_info_fault(int si_signo, int si_code,  	info.si_code = si_code;  	info.si_addr = (void __user *)address;  	info.si_trapno = fault_num; +	trace_unhandled_signal(type, regs, address, si_signo);  	force_sig_info(si_signo, &info, tsk);  } @@ -68,15 +71,17 @@ static noinline void force_sig_info_fault(int si_signo, int si_code,   * Synthesize the fault a PL0 process would get by doing a word-load of   * an unaligned address or a high kernel address.   
*/ -SYSCALL_DEFINE2(cmpxchg_badaddr, unsigned long, address, -		struct pt_regs *, regs) +SYSCALL_DEFINE1(cmpxchg_badaddr, unsigned long, address)  { +	struct pt_regs *regs = current_pt_regs(); +  	if (address >= PAGE_OFFSET) -		force_sig_info_fault(SIGSEGV, SEGV_MAPERR, address, -				     INT_DTLB_MISS, current); +		force_sig_info_fault("atomic segfault", SIGSEGV, SEGV_MAPERR, +				     address, INT_DTLB_MISS, current, regs);  	else -		force_sig_info_fault(SIGBUS, BUS_ADRALN, address, -				     INT_UNALIGN_DATA, current); +		force_sig_info_fault("atomic alignment fault", SIGBUS, +				     BUS_ADRALN, address, +				     INT_UNALIGN_DATA, current, regs);  	/*  	 * Adjust pc to point at the actual instruction, which is unusual @@ -118,16 +123,15 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)  	pmd_k = pmd_offset(pud_k, address);  	if (!pmd_present(*pmd_k))  		return NULL; -	if (!pmd_present(*pmd)) { +	if (!pmd_present(*pmd))  		set_pmd(pmd, *pmd_k); -		arch_flush_lazy_mmu_mode(); -	} else +	else  		BUG_ON(pmd_ptfn(*pmd) != pmd_ptfn(*pmd_k));  	return pmd_k;  }  /* - * Handle a fault on the vmalloc or module mapping area + * Handle a fault on the vmalloc area.   */  static inline int vmalloc_fault(pgd_t *pgd, unsigned long address)  { @@ -145,8 +149,6 @@ static inline int vmalloc_fault(pgd_t *pgd, unsigned long address)  	pmd_k = vmalloc_sync_one(pgd, address);  	if (!pmd_k)  		return -1; -	if (pmd_huge(*pmd_k)) -		return 0;   /* support TILE huge_vmap() API */  	pte_k = pte_offset_kernel(pmd_k, address);  	if (!pte_present(*pte_k))  		return -1; @@ -184,7 +186,7 @@ static pgd_t *get_current_pgd(void)  	HV_Context ctx = hv_inquire_context();  	unsigned long pgd_pfn = ctx.page_table >> PAGE_SHIFT;  	struct page *pgd_page = pfn_to_page(pgd_pfn); -	BUG_ON(PageHighMem(pgd_page));   /* oops, HIGHPTE? */ +	BUG_ON(PageHighMem(pgd_page));  	return (pgd_t *) __va(ctx.page_table);  } @@ -200,9 +202,14 @@ static pgd_t *get_current_pgd(void)   * interrupt or a critical region, and must do as little as possible.   * Similarly, we can't use atomic ops here, since we may be handling a   * fault caused by an atomic op access. + * + * If we find a migrating PTE while we're in an NMI context, and we're + * at a PC that has a registered exception handler, we don't wait, + * since this thread may (e.g.) have been interrupted while migrating + * its own stack, which would then cause us to self-deadlock.   */  static int handle_migrating_pte(pgd_t *pgd, int fault_num, -				unsigned long address, +				unsigned long address, unsigned long pc,  				int is_kernel_mode, int write)  {  	pud_t *pud; @@ -224,6 +231,8 @@ static int handle_migrating_pte(pgd_t *pgd, int fault_num,  		pte_offset_kernel(pmd, address);  	pteval = *pte;  	if (pte_migrating(pteval)) { +		if (in_nmi() && search_exception_tables(pc)) +			return 0;  		wait_for_migration(pte);  		return 1;  	} @@ -263,12 +272,15 @@ static int handle_page_fault(struct pt_regs *regs,  	int si_code;  	int is_kernel_mode;  	pgd_t *pgd; +	unsigned int flags;  	/* on TILE, protection faults are always writes */  	if (!is_page_fault)  		write = 1; -	is_kernel_mode = (EX1_PL(regs->ex1) != USER_PL); +	flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + +	is_kernel_mode = !user_mode(regs);  	tsk = validate_current(); @@ -291,13 +303,13 @@ static int handle_page_fault(struct pt_regs *regs,  	/*  	 * Early on, we need to check for migrating PTE entries;  	 * see homecache.c.  
If we find a migrating PTE, we wait until -	 * the backing page claims to be done migrating, then we procede. +	 * the backing page claims to be done migrating, then we proceed.  	 * For kernel PTEs, we rewrite the PTE and return and retry.  	 * Otherwise, we treat the fault like a normal "no PTE" fault,  	 * rather than trying to patch up the existing PTE.  	 */  	pgd = get_current_pgd(); -	if (handle_migrating_pte(pgd, fault_num, address, +	if (handle_migrating_pte(pgd, fault_num, address, regs->pc,  				 is_kernel_mode, write))  		return 1; @@ -332,9 +344,12 @@ static int handle_page_fault(struct pt_regs *regs,  	/*  	 * If we're trying to touch user-space addresses, we must  	 * be either at PL0, or else with interrupts enabled in the -	 * kernel, so either way we can re-enable interrupts here. +	 * kernel, so either way we can re-enable interrupts here +	 * unless we are doing atomic access to user space with +	 * interrupts disabled.  	 */ -	local_irq_enable(); +	if (!(regs->flags & PT_FLAGS_DISABLE_IRQ)) +		local_irq_enable();  	mm = tsk->mm; @@ -347,6 +362,9 @@ static int handle_page_fault(struct pt_regs *regs,  		goto bad_area_nosemaphore;  	} +	if (!is_kernel_mode) +		flags |= FAULT_FLAG_USER; +  	/*  	 * When running in the kernel we expect faults to occur only to  	 * addresses in user space.  All other faults represent errors in the @@ -369,6 +387,8 @@ static int handle_page_fault(struct pt_regs *regs,  			vma = NULL;  /* happy compiler */  			goto bad_area_nosemaphore;  		} + +retry:  		down_read(&mm->mmap_sem);  	} @@ -405,18 +425,22 @@ good_area:  #endif  		if (!(vma->vm_flags & VM_WRITE))  			goto bad_area; +		flags |= FAULT_FLAG_WRITE;  	} else {  		if (!is_page_fault || !(vma->vm_flags & VM_READ))  			goto bad_area;  	} - survive:  	/*  	 * If for any reason at all we couldn't handle the fault,  	 * make sure we exit gracefully rather than endlessly redo  	 * the fault.  	 */ -	fault = handle_mm_fault(mm, vma, address, write); +	fault = handle_mm_fault(mm, vma, address, flags); + +	if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) +		return 0; +  	if (unlikely(fault & VM_FAULT_ERROR)) {  		if (fault & VM_FAULT_OOM)  			goto out_of_memory; @@ -424,33 +448,33 @@ good_area:  			goto do_sigbus;  		BUG();  	} -	if (fault & VM_FAULT_MAJOR) -		tsk->maj_flt++; -	else -		tsk->min_flt++; +	if (flags & FAULT_FLAG_ALLOW_RETRY) { +		if (fault & VM_FAULT_MAJOR) +			tsk->maj_flt++; +		else +			tsk->min_flt++; +		if (fault & VM_FAULT_RETRY) { +			flags &= ~FAULT_FLAG_ALLOW_RETRY; +			flags |= FAULT_FLAG_TRIED; + +			 /* +			  * No need to up_read(&mm->mmap_sem) as we would +			  * have already released it in __lock_page_or_retry +			  * in mm/filemap.c. +			  */ +			goto retry; +		} +	} -#if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC() -	/* -	 * If this was an asynchronous fault, -	 * restart the appropriate engine. -	 */ -	switch (fault_num) {  #if CHIP_HAS_TILE_DMA() +	/* If this was a DMA TLB fault, restart the DMA engine. 
*/ +	switch (fault_num) {  	case INT_DMATLB_MISS:  	case INT_DMATLB_MISS_DWNCL:  	case INT_DMATLB_ACCESS:  	case INT_DMATLB_ACCESS_DWNCL:  		__insn_mtspr(SPR_DMA_CTR, SPR_DMA_CTR__REQUEST_MASK);  		break; -#endif -#if CHIP_HAS_SN_PROC() -	case INT_SNITLB_MISS: -	case INT_SNITLB_MISS_DWNCL: -		__insn_mtspr(SPR_SNCTL, -			     __insn_mfspr(SPR_SNCTL) & -			     ~SPR_SNCTL__FRZPROC_MASK); -		break; -#endif  	}  #endif @@ -472,8 +496,8 @@ bad_area_nosemaphore:  		 */  		local_irq_enable(); -		force_sig_info_fault(SIGSEGV, si_code, address, -				     fault_num, tsk); +		force_sig_info_fault("segfault", SIGSEGV, si_code, address, +				     fault_num, tsk, regs);  		return 0;  	} @@ -511,7 +535,7 @@ no_context:  	if (unlikely(tsk->pid < 2)) {  		panic("Kernel page fault running %s!", -		      tsk->pid ? "init" : "the idle task"); +		      is_idle_task(tsk) ? "the idle task" : "init");  	}  	/* @@ -531,15 +555,10 @@ no_context:   */  out_of_memory:  	up_read(&mm->mmap_sem); -	if (is_global_init(tsk)) { -		yield(); -		down_read(&mm->mmap_sem); -		goto survive; -	} -	pr_alert("VM: killing process %s\n", tsk->comm); -	if (!is_kernel_mode) -		do_group_exit(SIGKILL); -	goto no_context; +	if (is_kernel_mode) +		goto no_context; +	pagefault_out_of_memory(); +	return 0;  do_sigbus:  	up_read(&mm->mmap_sem); @@ -548,7 +567,8 @@ do_sigbus:  	if (is_kernel_mode)  		goto no_context; -	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, fault_num, tsk); +	force_sig_info_fault("bus error", SIGBUS, BUS_ADRERR, address, +			     fault_num, tsk, regs);  	return 0;  } @@ -656,20 +676,12 @@ struct intvec_state do_page_fault_ics(struct pt_regs *regs, int fault_num,  	}  	/* -	 * NOTE: the one other type of access that might bring us here -	 * are the memory ops in __tns_atomic_acquire/__tns_atomic_release, -	 * but we don't have to check specially for them since we can -	 * always safely return to the address of the fault and retry, -	 * since no separate atomic locks are involved. -	 */ - -	/*  	 * Now that we have released the atomic lock (if necessary),  	 * it's safe to spin if the PTE that caused the fault was migrating.  	 */  	if (fault_num == INT_DTLB_ACCESS)  		write = 1; -	if (handle_migrating_pte(pgd, fault_num, address, 1, write)) +	if (handle_migrating_pte(pgd, fault_num, address, pc, 1, write))  		return state;  	/* Return zero so that we continue on with normal fault handling. */ @@ -692,8 +704,60 @@ void do_page_fault(struct pt_regs *regs, int fault_num,  {  	int is_page_fault; +#ifdef CONFIG_KPROBES +	/* +	 * This is to notify the fault handler of the kprobes.  The +	 * exception code is redundant as it is also carried in REGS, +	 * but we pass it anyhow. +	 */ +	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, -1, +		       regs->faultnum, SIGSEGV) == NOTIFY_STOP) +		return; +#endif + +#ifdef __tilegx__ +	/* +	 * We don't need early do_page_fault_ics() support, since unlike +	 * Pro we don't need to worry about unlocking the atomic locks. +	 * There is only one current case in GX where we touch any memory +	 * under ICS other than our own kernel stack, and we handle that +	 * here.  (If we crash due to trying to touch our own stack, +	 * we're in too much trouble for C code to help out anyway.) 
+	 */ +	if (write & ~1) { +		unsigned long pc = write & ~1; +		if (pc >= (unsigned long) __start_unalign_asm_code && +		    pc < (unsigned long) __end_unalign_asm_code) { +			struct thread_info *ti = current_thread_info(); +			/* +			 * Our EX_CONTEXT is still what it was from the +			 * initial unalign exception, but now we've faulted +			 * on the JIT page.  We would like to complete the +			 * page fault however is appropriate, and then retry +			 * the instruction that caused the unalign exception. +			 * Our state has been "corrupted" by setting the low +			 * bit in "sp", and stashing r0..r3 in the +			 * thread_info area, so we revert all of that, then +			 * continue as if this were a normal page fault. +			 */ +			regs->sp &= ~1UL; +			regs->regs[0] = ti->unalign_jit_tmp[0]; +			regs->regs[1] = ti->unalign_jit_tmp[1]; +			regs->regs[2] = ti->unalign_jit_tmp[2]; +			regs->regs[3] = ti->unalign_jit_tmp[3]; +			write &= 1; +		} else { +			pr_alert("%s/%d: ICS set at page fault at %#lx: %#lx\n", +				 current->comm, current->pid, pc, address); +			show_regs(regs); +			do_group_exit(SIGKILL); +			return; +		} +	} +#else  	/* This case should have been handled by do_page_fault_ics(). */  	BUG_ON(write & ~1); +#endif  #if CHIP_HAS_TILE_DMA()  	/* @@ -722,10 +786,6 @@ void do_page_fault(struct pt_regs *regs, int fault_num,  	case INT_DMATLB_MISS:  	case INT_DMATLB_MISS_DWNCL:  #endif -#if CHIP_HAS_SN_PROC() -	case INT_SNITLB_MISS: -	case INT_SNITLB_MISS_DWNCL: -#endif  		is_page_fault = 1;  		break; @@ -741,7 +801,8 @@ void do_page_fault(struct pt_regs *regs, int fault_num,  		panic("Bad fault number %d in do_page_fault", fault_num);  	} -	if (EX1_PL(regs->ex1) != USER_PL) { +#if CHIP_HAS_TILE_DMA() +	if (!user_mode(regs)) {  		struct async_tlb *async;  		switch (fault_num) {  #if CHIP_HAS_TILE_DMA() @@ -752,12 +813,6 @@ void do_page_fault(struct pt_regs *regs, int fault_num,  			async = ¤t->thread.dma_async_tlb;  			break;  #endif -#if CHIP_HAS_SN_PROC() -		case INT_SNITLB_MISS: -		case INT_SNITLB_MISS_DWNCL: -			async = ¤t->thread.sn_async_tlb; -			break; -#endif  		default:  			async = NULL;  		} @@ -784,19 +839,28 @@ void do_page_fault(struct pt_regs *regs, int fault_num,  			return;  		}  	} +#endif  	handle_page_fault(regs, fault_num, is_page_fault, address, write);  } -#if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC() +#if CHIP_HAS_TILE_DMA()  /* - * Check an async_tlb structure to see if a deferred fault is waiting, - * and if so pass it to the page-fault code. + * This routine effectively re-issues asynchronous page faults + * when we are returning to user space.   */ -static void handle_async_page_fault(struct pt_regs *regs, -				    struct async_tlb *async) +void do_async_page_fault(struct pt_regs *regs)  { +	struct async_tlb *async = ¤t->thread.dma_async_tlb; + +	/* +	 * Clear thread flag early.  If we re-interrupt while processing +	 * code here, we will reset it and recall this routine before +	 * returning to user space. +	 */ +	clear_thread_flag(TIF_ASYNC_TLB); +  	if (async->fault_num) {  		/*  		 * Clear async->fault_num before calling the page-fault @@ -810,35 +874,15 @@ static void handle_async_page_fault(struct pt_regs *regs,  				  async->address, async->is_write);  	}  } -#endif /* CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC() */ - - -/* - * This routine effectively re-issues asynchronous page faults - * when we are returning to user space. - */ -void do_async_page_fault(struct pt_regs *regs) -{ -	/* -	 * Clear thread flag early.  
If we re-interrupt while processing -	 * code here, we will reset it and recall this routine before -	 * returning to user space. -	 */ -	clear_thread_flag(TIF_ASYNC_TLB); +#endif /* CHIP_HAS_TILE_DMA() */ -#if CHIP_HAS_TILE_DMA() -	handle_async_page_fault(regs, ¤t->thread.dma_async_tlb); -#endif -#if CHIP_HAS_SN_PROC() -	handle_async_page_fault(regs, ¤t->thread.sn_async_tlb); -#endif -}  void vmalloc_sync_all(void)  {  #ifdef __tilegx__  	/* Currently all L1 kernel pmd's are static and shared. */ -	BUG_ON(pgd_index(VMALLOC_END) != pgd_index(VMALLOC_START)); +	BUILD_BUG_ON(pgd_index(VMALLOC_END - PAGE_SIZE) != +		     pgd_index(VMALLOC_START));  #else  	/*  	 * Note that races in the updates of insync and start aren't diff --git a/arch/tile/mm/highmem.c b/arch/tile/mm/highmem.c index 31dbbd9afe4..0dc21829477 100644 --- a/arch/tile/mm/highmem.c +++ b/arch/tile/mm/highmem.c @@ -93,7 +93,7 @@ static DEFINE_PER_CPU(struct kmap_amps, amps);   * If we examine it earlier we are exposed to a race where it looks   * writable earlier, but becomes immutable before we write the PTE.   */ -static void kmap_atomic_register(struct page *page, enum km_type type, +static void kmap_atomic_register(struct page *page, int type,  				 unsigned long va, pte_t *ptep, pte_t pteval)  {  	unsigned long flags; @@ -114,7 +114,6 @@ static void kmap_atomic_register(struct page *page, enum km_type type,  	list_add(&->list, &_list);  	set_pte(ptep, pteval); -	arch_flush_lazy_mmu_mode();  	spin_unlock(&_lock);  	homecache_kpte_unlock(flags); @@ -224,12 +223,12 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)  }  EXPORT_SYMBOL(kmap_atomic_prot); -void *__kmap_atomic(struct page *page) +void *kmap_atomic(struct page *page)  {  	/* PAGE_NONE is a magic value that tells us to check immutability. */  	return kmap_atomic_prot(page, PAGE_NONE);  } -EXPORT_SYMBOL(__kmap_atomic); +EXPORT_SYMBOL(kmap_atomic);  void __kunmap_atomic(void *kvaddr)  { @@ -259,7 +258,6 @@ void __kunmap_atomic(void *kvaddr)  		BUG_ON(vaddr >= (unsigned long)high_memory);  	} -	arch_flush_lazy_mmu_mode();  	pagefault_enable();  }  EXPORT_SYMBOL(__kunmap_atomic); diff --git a/arch/tile/mm/homecache.c b/arch/tile/mm/homecache.c index d78df3a6ee1..33294fdc402 100644 --- a/arch/tile/mm/homecache.c +++ b/arch/tile/mm/homecache.c @@ -30,6 +30,7 @@  #include <linux/cache.h>  #include <linux/smp.h>  #include <linux/module.h> +#include <linux/hugetlb.h>  #include <asm/page.h>  #include <asm/sections.h> @@ -42,12 +43,9 @@  #include "migrate.h" -#if CHIP_HAS_COHERENT_LOCAL_CACHE() -  /*   * The noallocl2 option suppresses all use of the L2 cache to cache - * locally from a remote home.  There's no point in using it if we - * don't have coherent local caching, though. + * locally from a remote home.   */  static int __write_once noallocl2;  static int __init set_noallocl2(char *str) @@ -57,16 +55,6 @@ static int __init set_noallocl2(char *str)  }  early_param("noallocl2", set_noallocl2); -#else - -#define noallocl2 0 - -#endif - -/* Provide no-op versions of these routines to keep flush_remote() cleaner. */ -#define mark_caches_evicted_start() 0 -#define mark_caches_evicted_finish(mask, timestamp) do {} while (0) -  /*   * Update the irq_stat for cpus that we are going to interrupt @@ -106,7 +94,6 @@ static void hv_flush_update(const struct cpumask *cache_cpumask,   *    there's never any good reason for hv_flush_remote() to fail.   *  - Accepts a 32-bit PFN rather than a 64-bit PA, which generally   *    is the type that Linux wants to pass around anyway. 
- *  - Centralizes the mark_caches_evicted() handling.   *  - Canonicalizes that lengths of zero make cpumasks NULL.   *  - Handles deferring TLB flushes for dataplane tiles.   *  - Tracks remote interrupts in the per-cpu irq_cpustat_t. @@ -125,7 +112,6 @@ void flush_remote(unsigned long cache_pfn, unsigned long cache_control,  		  HV_Remote_ASID *asids, int asidcount)  {  	int rc; -	int timestamp = 0;  /* happy compiler */  	struct cpumask cache_cpumask_copy, tlb_cpumask_copy;  	struct cpumask *cache_cpumask, *tlb_cpumask;  	HV_PhysAddr cache_pa; @@ -156,15 +142,11 @@ void flush_remote(unsigned long cache_pfn, unsigned long cache_control,  	hv_flush_update(cache_cpumask, tlb_cpumask, tlb_va, tlb_length,  			asids, asidcount);  	cache_pa = (HV_PhysAddr)cache_pfn << PAGE_SHIFT; -	if (cache_control & HV_FLUSH_EVICT_L2) -		timestamp = mark_caches_evicted_start();  	rc = hv_flush_remote(cache_pa, cache_control,  			     cpumask_bits(cache_cpumask),  			     tlb_va, tlb_length, tlb_pgsize,  			     cpumask_bits(tlb_cpumask),  			     asids, asidcount); -	if (cache_control & HV_FLUSH_EVICT_L2) -		mark_caches_evicted_finish(cache_cpumask, timestamp);  	if (rc == 0)  		return;  	cpumask_scnprintf(cache_buf, sizeof(cache_buf), &cache_cpumask_copy); @@ -179,59 +161,88 @@ void flush_remote(unsigned long cache_pfn, unsigned long cache_control,  	panic("Unsafe to continue.");  } -void homecache_evict(const struct cpumask *mask) +static void homecache_finv_page_va(void* va, int home)  { -	flush_remote(0, HV_FLUSH_EVICT_L2, mask, 0, 0, 0, NULL, NULL, 0); +	int cpu = get_cpu(); +	if (home == cpu) { +		finv_buffer_local(va, PAGE_SIZE); +	} else if (home == PAGE_HOME_HASH) { +		finv_buffer_remote(va, PAGE_SIZE, 1); +	} else { +		BUG_ON(home < 0 || home >= NR_CPUS); +		finv_buffer_remote(va, PAGE_SIZE, 0); +	} +	put_cpu();  } -/* Return a mask of the cpus whose caches currently own these pages. */ -static void homecache_mask(struct page *page, int pages, -			   struct cpumask *home_mask) +void homecache_finv_map_page(struct page *page, int home)  { -	int i; -	cpumask_clear(home_mask); -	for (i = 0; i < pages; ++i) { -		int home = page_home(&page[i]); -		if (home == PAGE_HOME_IMMUTABLE || -		    home == PAGE_HOME_INCOHERENT) { -			cpumask_copy(home_mask, cpu_possible_mask); -			return; -		} -#if CHIP_HAS_CBOX_HOME_MAP() -		if (home == PAGE_HOME_HASH) { -			cpumask_or(home_mask, home_mask, &hash_for_home_map); -			continue; -		} +	unsigned long flags; +	unsigned long va; +	pte_t *ptep; +	pte_t pte; + +	if (home == PAGE_HOME_UNCACHED) +		return; +	local_irq_save(flags); +#ifdef CONFIG_HIGHMEM +	va = __fix_to_virt(FIX_KMAP_BEGIN + kmap_atomic_idx_push() + +			   (KM_TYPE_NR * smp_processor_id())); +#else +	va = __fix_to_virt(FIX_HOMECACHE_BEGIN + smp_processor_id());  #endif -		if (home == PAGE_HOME_UNCACHED) -			continue; -		BUG_ON(home < 0 || home >= NR_CPUS); -		cpumask_set_cpu(home, home_mask); -	} +	ptep = virt_to_kpte(va); +	pte = pfn_pte(page_to_pfn(page), PAGE_KERNEL); +	__set_pte(ptep, pte_set_home(pte, home)); +	homecache_finv_page_va((void *)va, home); +	__pte_clear(ptep); +	hv_flush_page(va, PAGE_SIZE); +#ifdef CONFIG_HIGHMEM +	kmap_atomic_idx_pop(); +#endif +	local_irq_restore(flags);  } -/* - * Return the passed length, or zero if it's long enough that we - * believe we should evict the whole L2 cache. 
- */ -static unsigned long cache_flush_length(unsigned long length) +static void homecache_finv_page_home(struct page *page, int home) +{ +	if (!PageHighMem(page) && home == page_home(page)) +		homecache_finv_page_va(page_address(page), home); +	else +		homecache_finv_map_page(page, home); +} + +static inline bool incoherent_home(int home) +{ +	return home == PAGE_HOME_IMMUTABLE || home == PAGE_HOME_INCOHERENT; +} + +static void homecache_finv_page_internal(struct page *page, int force_map)  { -	return (length >= CHIP_L2_CACHE_SIZE()) ? HV_FLUSH_EVICT_L2 : length; +	int home = page_home(page); +	if (home == PAGE_HOME_UNCACHED) +		return; +	if (incoherent_home(home)) { +		int cpu; +		for_each_cpu(cpu, &cpu_cacheable_map) +			homecache_finv_map_page(page, cpu); +	} else if (force_map) { +		/* Force if, e.g., the normal mapping is migrating. */ +		homecache_finv_map_page(page, home); +	} else { +		homecache_finv_page_home(page, home); +	} +	sim_validate_lines_evicted(PFN_PHYS(page_to_pfn(page)), PAGE_SIZE);  } -/* Flush a page out of whatever cache(s) it is in. */ -void homecache_flush_cache(struct page *page, int order) +void homecache_finv_page(struct page *page)  { -	int pages = 1 << order; -	int length = cache_flush_length(pages * PAGE_SIZE); -	unsigned long pfn = page_to_pfn(page); -	struct cpumask home_mask; - -	homecache_mask(page, pages, &home_mask); -	flush_remote(pfn, length, &home_mask, 0, 0, 0, NULL, NULL, 0); -	sim_validate_lines_evicted(PFN_PHYS(pfn), pages * PAGE_SIZE); +	homecache_finv_page_internal(page, 0);  } +void homecache_evict(const struct cpumask *mask) +{ +	flush_remote(0, HV_FLUSH_EVICT_L2, mask, 0, 0, 0, NULL, NULL, 0); +}  /* Report the home corresponding to a given PTE. */  static int pte_to_home(pte_t pte) @@ -245,10 +256,8 @@ static int pte_to_home(pte_t pte)  		return PAGE_HOME_INCOHERENT;  	case HV_PTE_MODE_UNCACHED:  		return PAGE_HOME_UNCACHED; -#if CHIP_HAS_CBOX_HOME_MAP()  	case HV_PTE_MODE_CACHE_HASH_L3:  		return PAGE_HOME_HASH; -#endif  	}  	panic("Bad PTE %#llx\n", pte.val);  } @@ -305,20 +314,16 @@ pte_t pte_set_home(pte_t pte, int home)  						      HV_PTE_MODE_CACHE_NO_L3);  			}  		} else -#if CHIP_HAS_CBOX_HOME_MAP()  		if (hash_default)  			pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_HASH_L3);  		else -#endif  			pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_NO_L3);  		pte = hv_pte_set_nc(pte);  		break; -#if CHIP_HAS_CBOX_HOME_MAP()  	case PAGE_HOME_HASH:  		pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_HASH_L3);  		break; -#endif  	default:  		BUG_ON(home < 0 || home >= NR_CPUS || @@ -328,7 +333,6 @@ pte_t pte_set_home(pte_t pte, int home)  		break;  	} -#if CHIP_HAS_NC_AND_NOALLOC_BITS()  	if (noallocl2)  		pte = hv_pte_set_no_alloc_l2(pte); @@ -337,7 +341,6 @@ pte_t pte_set_home(pte_t pte, int home)  	    hv_pte_get_mode(pte) == HV_PTE_MODE_CACHE_NO_L3) {  		pte = hv_pte_set_mode(pte, HV_PTE_MODE_UNCACHED);  	} -#endif  	/* Checking this case here gives a better panic than from the hv. */  	BUG_ON(hv_pte_get_mode(pte) == 0); @@ -353,21 +356,16 @@ EXPORT_SYMBOL(pte_set_home);   * so they're not suitable for anything but infrequent use.   
*/ -#if CHIP_HAS_CBOX_HOME_MAP() -static inline int initial_page_home(void) { return PAGE_HOME_HASH; } -#else -static inline int initial_page_home(void) { return 0; } -#endif -  int page_home(struct page *page)  {  	if (PageHighMem(page)) { -		return initial_page_home(); +		return PAGE_HOME_HASH;  	} else {  		unsigned long kva = (unsigned long)page_address(page); -		return pte_to_home(*virt_to_pte(NULL, kva)); +		return pte_to_home(*virt_to_kpte(kva));  	}  } +EXPORT_SYMBOL(page_home);  void homecache_change_page_home(struct page *page, int order, int home)  { @@ -383,12 +381,13 @@ void homecache_change_page_home(struct page *page, int order, int home)  		     NULL, 0);  	for (i = 0; i < pages; ++i, kva += PAGE_SIZE) { -		pte_t *ptep = virt_to_pte(NULL, kva); +		pte_t *ptep = virt_to_kpte(kva);  		pte_t pteval = *ptep;  		BUG_ON(!pte_present(pteval) || pte_huge(pteval)); -		*ptep = pte_set_home(pteval, home); +		__set_pte(ptep, pte_set_home(pteval, home));  	}  } +EXPORT_SYMBOL(homecache_change_page_home);  struct page *homecache_alloc_pages(gfp_t gfp_mask,  				   unsigned int order, int home) @@ -413,19 +412,25 @@ struct page *homecache_alloc_pages_node(int nid, gfp_t gfp_mask,  	return page;  } -void homecache_free_pages(unsigned long addr, unsigned int order) +void __homecache_free_pages(struct page *page, unsigned int order)  { -	struct page *page; - -	if (addr == 0) -		return; - -	VM_BUG_ON(!virt_addr_valid((void *)addr)); -	page = virt_to_page((void *)addr);  	if (put_page_testzero(page)) { -		int pages = (1 << order); -		homecache_change_page_home(page, order, initial_page_home()); -		while (pages--) -			__free_page(page++); +		homecache_change_page_home(page, order, PAGE_HOME_HASH); +		if (order == 0) { +			free_hot_cold_page(page, false); +		} else { +			init_page_count(page); +			__free_pages(page, order); +		} +	} +} +EXPORT_SYMBOL(__homecache_free_pages); + +void homecache_free_pages(unsigned long addr, unsigned int order) +{ +	if (addr != 0) { +		VM_BUG_ON(!virt_addr_valid((void *)addr)); +		__homecache_free_pages(virt_to_page((void *)addr), order);  	}  } +EXPORT_SYMBOL(homecache_free_pages); diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c index 24688b697a8..e514899e110 100644 --- a/arch/tile/mm/hugetlbpage.c +++ b/arch/tile/mm/hugetlbpage.c @@ -21,92 +21,135 @@  #include <linux/mm.h>  #include <linux/hugetlb.h>  #include <linux/pagemap.h> -#include <linux/smp_lock.h>  #include <linux/slab.h>  #include <linux/err.h>  #include <linux/sysctl.h>  #include <linux/mman.h>  #include <asm/tlb.h>  #include <asm/tlbflush.h> +#include <asm/setup.h> + +#ifdef CONFIG_HUGETLB_SUPER_PAGES + +/* + * Provide an additional huge page size (in addition to the regular default + * huge page size) if no "hugepagesz" arguments are specified. + * Note that it must be smaller than the default huge page size so + * that it's possible to allocate them on demand from the buddy allocator. + * You can change this to 64K (on a 16K build), 256K, 1M, or 4M, + * or not define it at all. + */ +#define ADDITIONAL_HUGE_SIZE (1024 * 1024UL) + +/* "Extra" page-size multipliers, one per level of the page table. 
*/ +int huge_shift[HUGE_SHIFT_ENTRIES] = { +#ifdef ADDITIONAL_HUGE_SIZE +#define ADDITIONAL_HUGE_SHIFT __builtin_ctzl(ADDITIONAL_HUGE_SIZE / PAGE_SIZE) +	[HUGE_SHIFT_PAGE] = ADDITIONAL_HUGE_SHIFT +#endif +}; + +#endif  pte_t *huge_pte_alloc(struct mm_struct *mm,  		      unsigned long addr, unsigned long sz)  {  	pgd_t *pgd;  	pud_t *pud; -	pte_t *pte = NULL; -	/* We do not yet support multiple huge page sizes. */ -	BUG_ON(sz != PMD_SIZE); +	addr &= -sz;   /* Mask off any low bits in the address. */  	pgd = pgd_offset(mm, addr);  	pud = pud_alloc(mm, pgd, addr); -	if (pud) -		pte = (pte_t *) pmd_alloc(mm, pud, addr); -	BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); -	return pte; +#ifdef CONFIG_HUGETLB_SUPER_PAGES +	if (sz >= PGDIR_SIZE) { +		BUG_ON(sz != PGDIR_SIZE && +		       sz != PGDIR_SIZE << huge_shift[HUGE_SHIFT_PGDIR]); +		return (pte_t *)pud; +	} else { +		pmd_t *pmd = pmd_alloc(mm, pud, addr); +		if (sz >= PMD_SIZE) { +			BUG_ON(sz != PMD_SIZE && +			       sz != (PMD_SIZE << huge_shift[HUGE_SHIFT_PMD])); +			return (pte_t *)pmd; +		} +		else { +			if (sz != PAGE_SIZE << huge_shift[HUGE_SHIFT_PAGE]) +				panic("Unexpected page size %#lx\n", sz); +			return pte_alloc_map(mm, NULL, pmd, addr); +		} +	} +#else +	BUG_ON(sz != PMD_SIZE); +	return (pte_t *) pmd_alloc(mm, pud, addr); +#endif  } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +static pte_t *get_pte(pte_t *base, int index, int level)  { -	pgd_t *pgd; -	pud_t *pud; -	pmd_t *pmd = NULL; - -	pgd = pgd_offset(mm, addr); -	if (pgd_present(*pgd)) { -		pud = pud_offset(pgd, addr); -		if (pud_present(*pud)) -			pmd = pmd_offset(pud, addr); +	pte_t *ptep = base + index; +#ifdef CONFIG_HUGETLB_SUPER_PAGES +	if (!pte_present(*ptep) && huge_shift[level] != 0) { +		unsigned long mask = -1UL << huge_shift[level]; +		pte_t *super_ptep = base + (index & mask); +		pte_t pte = *super_ptep; +		if (pte_present(pte) && pte_super(pte)) +			ptep = super_ptep;  	} -	return (pte_t *) pmd; +#endif +	return ptep;  } -#ifdef HUGETLB_TEST -struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, -			      int write) +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)  { -	unsigned long start = address; -	int length = 1; -	int nr; -	struct page *page; -	struct vm_area_struct *vma; - -	vma = find_vma(mm, addr); -	if (!vma || !is_vm_hugetlb_page(vma)) -		return ERR_PTR(-EINVAL); - -	pte = huge_pte_offset(mm, address); - -	/* hugetlb should be locked, and hence, prefaulted */ -	WARN_ON(!pte || pte_none(*pte)); - -	page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; - -	WARN_ON(!PageHead(page)); +	pgd_t *pgd; +	pud_t *pud; +	pmd_t *pmd; +#ifdef CONFIG_HUGETLB_SUPER_PAGES +	pte_t *pte; +#endif -	return page; -} +	/* Get the top-level page table entry. */ +	pgd = (pgd_t *)get_pte((pte_t *)mm->pgd, pgd_index(addr), 0); -int pmd_huge(pmd_t pmd) -{ -	return 0; -} +	/* We don't have four levels. */ +	pud = pud_offset(pgd, addr); +#ifndef __PAGETABLE_PUD_FOLDED +# error support fourth page table level +#endif +	if (!pud_present(*pud)) +		return NULL; + +	/* Check for an L0 huge PTE, if we have three levels. */ +#ifndef __PAGETABLE_PMD_FOLDED +	if (pud_huge(*pud)) +		return (pte_t *)pud; + +	pmd = (pmd_t *)get_pte((pte_t *)pud_page_vaddr(*pud), +			       pmd_index(addr), 1); +	if (!pmd_present(*pmd)) +		return NULL; +#else +	pmd = pmd_offset(pud, addr); +#endif -int pud_huge(pud_t pud) -{ -	return 0; -} +	/* Check for an L1 huge PTE. 
*/ +	if (pmd_huge(*pmd)) +		return (pte_t *)pmd; + +#ifdef CONFIG_HUGETLB_SUPER_PAGES +	/* Check for an L2 huge PTE. */ +	pte = get_pte((pte_t *)pmd_page_vaddr(*pmd), pte_index(addr), 2); +	if (!pte_present(*pte)) +		return NULL; +	if (pte_super(*pte)) +		return pte; +#endif -struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, -			     pmd_t *pmd, int write) -{  	return NULL;  } -#else -  struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,  			      int write)  { @@ -150,50 +193,21 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)  	return 0;  } -#endif -  #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA  static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,  		unsigned long addr, unsigned long len,  		unsigned long pgoff, unsigned long flags)  {  	struct hstate *h = hstate_file(file); -	struct mm_struct *mm = current->mm; -	struct vm_area_struct *vma; -	unsigned long start_addr; - -	if (len > mm->cached_hole_size) { -		start_addr = mm->free_area_cache; -	} else { -		start_addr = TASK_UNMAPPED_BASE; -		mm->cached_hole_size = 0; -	} - -full_search: -	addr = ALIGN(start_addr, huge_page_size(h)); - -	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { -		/* At this point:  (!vma || addr < vma->vm_end). */ -		if (TASK_SIZE - len < addr) { -			/* -			 * Start a new search - just in case we missed -			 * some holes. -			 */ -			if (start_addr != TASK_UNMAPPED_BASE) { -				start_addr = TASK_UNMAPPED_BASE; -				mm->cached_hole_size = 0; -				goto full_search; -			} -			return -ENOMEM; -		} -		if (!vma || addr + len <= vma->vm_start) { -			mm->free_area_cache = addr + len; -			return addr; -		} -		if (addr + mm->cached_hole_size < vma->vm_start) -			mm->cached_hole_size = vma->vm_start - addr; -		addr = ALIGN(vma->vm_end, huge_page_size(h)); -	} +	struct vm_unmapped_area_info info; + +	info.flags = 0; +	info.length = len; +	info.low_limit = TASK_UNMAPPED_BASE; +	info.high_limit = TASK_SIZE; +	info.align_mask = PAGE_MASK & ~huge_page_mask(h); +	info.align_offset = 0; +	return vm_unmapped_area(&info);  }  static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, @@ -201,92 +215,30 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,  		unsigned long pgoff, unsigned long flags)  {  	struct hstate *h = hstate_file(file); -	struct mm_struct *mm = current->mm; -	struct vm_area_struct *vma, *prev_vma; -	unsigned long base = mm->mmap_base, addr = addr0; -	unsigned long largest_hole = mm->cached_hole_size; -	int first_time = 1; - -	/* don't allow allocations above current base */ -	if (mm->free_area_cache > base) -		mm->free_area_cache = base; - -	if (len <= largest_hole) { -		largest_hole = 0; -		mm->free_area_cache  = base; -	} -try_again: -	/* make sure it can fit in the remaining address space */ -	if (mm->free_area_cache < len) -		goto fail; - -	/* either no address requested or cant fit in requested address hole */ -	addr = (mm->free_area_cache - len) & huge_page_mask(h); -	do { -		/* -		 * Lookup failure means no vma is above this address, -		 * i.e. 
return with success: -		 */ -		vma = find_vma_prev(mm, addr, &prev_vma); -		if (!vma) { -			return addr; -			break; -		} +	struct vm_unmapped_area_info info; +	unsigned long addr; -		/* -		 * new region fits between prev_vma->vm_end and -		 * vma->vm_start, use it: -		 */ -		if (addr + len <= vma->vm_start && -			    (!prev_vma || (addr >= prev_vma->vm_end))) { -			/* remember the address as a hint for next time */ -			mm->cached_hole_size = largest_hole; -			mm->free_area_cache = addr; -			return addr; -		} else { -			/* pull free_area_cache down to the first hole */ -			if (mm->free_area_cache == vma->vm_end) { -				mm->free_area_cache = vma->vm_start; -				mm->cached_hole_size = largest_hole; -			} -		} +	info.flags = VM_UNMAPPED_AREA_TOPDOWN; +	info.length = len; +	info.low_limit = PAGE_SIZE; +	info.high_limit = current->mm->mmap_base; +	info.align_mask = PAGE_MASK & ~huge_page_mask(h); +	info.align_offset = 0; +	addr = vm_unmapped_area(&info); -		/* remember the largest hole we saw so far */ -		if (addr + largest_hole < vma->vm_start) -			largest_hole = vma->vm_start - addr; - -		/* try just below the current vma->vm_start */ -		addr = (vma->vm_start - len) & huge_page_mask(h); - -	} while (len <= vma->vm_start); - -fail: -	/* -	 * if hint left us with no space for the requested -	 * mapping then try again: -	 */ -	if (first_time) { -		mm->free_area_cache = base; -		largest_hole = 0; -		first_time = 0; -		goto try_again; -	}  	/*  	 * A failed mmap() very likely causes application failure,  	 * so fall back to the bottom-up function here. This scenario  	 * can happen with large stack limits and large mmap()  	 * allocations.  	 */ -	mm->free_area_cache = TASK_UNMAPPED_BASE; -	mm->cached_hole_size = ~0UL; -	addr = hugetlb_get_unmapped_area_bottomup(file, addr0, -			len, pgoff, flags); - -	/* -	 * Restore the topdown base: -	 */ -	mm->free_area_cache = base; -	mm->cached_hole_size = ~0UL; +	if (addr & ~PAGE_MASK) { +		VM_BUG_ON(addr != -ENOMEM); +		info.flags = 0; +		info.low_limit = TASK_UNMAPPED_BASE; +		info.high_limit = TASK_SIZE; +		addr = vm_unmapped_area(&info); +	}  	return addr;  } @@ -323,21 +275,102 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,  		return hugetlb_get_unmapped_area_topdown(file, addr, len,  				pgoff, flags);  } +#endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */ -static __init int setup_hugepagesz(char *opt) +#ifdef CONFIG_HUGETLB_SUPER_PAGES +static __init int __setup_hugepagesz(unsigned long ps)  { -	unsigned long ps = memparse(opt, &opt); -	if (ps == PMD_SIZE) { -		hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); -	} else if (ps == PUD_SIZE) { -		hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); +	int log_ps = __builtin_ctzl(ps); +	int level, base_shift; + +	if ((1UL << log_ps) != ps || (log_ps & 1) != 0) { +		pr_warn("Not enabling %ld byte huge pages;" +			" must be a power of four.\n", ps); +		return -EINVAL; +	} + +	if (ps > 64*1024*1024*1024UL) { +		pr_warn("Not enabling %ld MB huge pages;" +			" largest legal value is 64 GB .\n", ps >> 20); +		return -EINVAL; +	} else if (ps >= PUD_SIZE) { +		static long hv_jpage_size; +		if (hv_jpage_size == 0) +			hv_jpage_size = hv_sysconf(HV_SYSCONF_PAGE_SIZE_JUMBO); +		if (hv_jpage_size != PUD_SIZE) { +			pr_warn("Not enabling >= %ld MB huge pages:" +				" hypervisor reports size %ld\n", +				PUD_SIZE >> 20, hv_jpage_size); +			return -EINVAL; +		} +		level = 0; +		base_shift = PUD_SHIFT; +	} else if (ps >= PMD_SIZE) { +		level = 1; +		base_shift = PMD_SHIFT; +	} else if (ps > PAGE_SIZE) { +		
level = 2; +		base_shift = PAGE_SHIFT;  	} else { -		pr_err("hugepagesz: Unsupported page size %lu M\n", -			ps >> 20); -		return 0; +		pr_err("hugepagesz: huge page size %ld too small\n", ps); +		return -EINVAL; +	} + +	if (log_ps != base_shift) { +		int shift_val = log_ps - base_shift; +		if (huge_shift[level] != 0) { +			int old_shift = base_shift + huge_shift[level]; +			pr_warn("Not enabling %ld MB huge pages;" +				" already have size %ld MB.\n", +				ps >> 20, (1UL << old_shift) >> 20); +			return -EINVAL; +		} +		if (hv_set_pte_super_shift(level, shift_val) != 0) { +			pr_warn("Not enabling %ld MB huge pages;" +				" no hypervisor support.\n", ps >> 20); +			return -EINVAL; +		} +		printk(KERN_DEBUG "Enabled %ld MB huge pages\n", ps >> 20); +		huge_shift[level] = shift_val; +	} + +	hugetlb_add_hstate(log_ps - PAGE_SHIFT); + +	return 0; +} + +static bool saw_hugepagesz; + +static __init int setup_hugepagesz(char *opt) +{ +	if (!saw_hugepagesz) { +		saw_hugepagesz = true; +		memset(huge_shift, 0, sizeof(huge_shift));  	} -	return 1; +	return __setup_hugepagesz(memparse(opt, NULL));  }  __setup("hugepagesz=", setup_hugepagesz); -#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/ +#ifdef ADDITIONAL_HUGE_SIZE +/* + * Provide an additional huge page size if no "hugepagesz" args are given. + * In that case, all the cores have properly set up their hv super_shift + * already, but we need to notify the hugetlb code to enable the + * new huge page size from the Linux point of view. + */ +static __init int add_default_hugepagesz(void) +{ +	if (!saw_hugepagesz) { +		BUILD_BUG_ON(ADDITIONAL_HUGE_SIZE >= PMD_SIZE || +			     ADDITIONAL_HUGE_SIZE <= PAGE_SIZE); +		BUILD_BUG_ON((PAGE_SIZE << ADDITIONAL_HUGE_SHIFT) != +			     ADDITIONAL_HUGE_SIZE); +		BUILD_BUG_ON(ADDITIONAL_HUGE_SHIFT & 1); +		hugetlb_add_hstate(ADDITIONAL_HUGE_SHIFT); +	} +	return 0; +} +arch_initcall(add_default_hugepagesz); +#endif + +#endif /* CONFIG_HUGETLB_SUPER_PAGES */ diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c index 0b9ce69b0ee..bfb3127b4df 100644 --- a/arch/tile/mm/init.c +++ b/arch/tile/mm/init.c @@ -38,7 +38,6 @@  #include <linux/uaccess.h>  #include <asm/mmu_context.h>  #include <asm/processor.h> -#include <asm/system.h>  #include <asm/pgtable.h>  #include <asm/pgalloc.h>  #include <asm/dma.h> @@ -53,26 +52,13 @@  #include "migrate.h" -/* - * We could set FORCE_MAX_ZONEORDER to "(HPAGE_SHIFT - PAGE_SHIFT + 1)" - * in the Tile Kconfig, but this generates configure warnings. - * Do it here and force people to get it right to compile this file. - * The problem is that with 4KB small pages and 16MB huge pages, - * the default value doesn't allow us to group enough small pages - * together to make up a huge page. 
- */ -#if CONFIG_FORCE_MAX_ZONEORDER < HPAGE_SHIFT - PAGE_SHIFT + 1 -# error "Change FORCE_MAX_ZONEORDER in arch/tile/Kconfig to match page size" -#endif -  #define clear_pgd(pmdptr) (*(pmdptr) = hv_pte(0))  #ifndef __tilegx__  unsigned long VMALLOC_RESERVE = CONFIG_VMALLOC_RESERVE; +EXPORT_SYMBOL(VMALLOC_RESERVE);  #endif -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); -  /* Create an L2 page table */  static pte_t * __init alloc_pte(void)  { @@ -96,7 +82,7 @@ static int num_l2_ptes[MAX_NUMNODES];  static void init_prealloc_ptes(int node, int pages)  { -	BUG_ON(pages & (HV_L2_ENTRIES-1)); +	BUG_ON(pages & (PTRS_PER_PTE - 1));  	if (pages) {  		num_l2_ptes[node] = pages;  		l2_ptes[node] = __alloc_bootmem(pages * sizeof(pte_t), @@ -120,10 +106,8 @@ pte_t *get_prealloc_pte(unsigned long pfn)   */  static int initial_heap_home(void)  { -#if CHIP_HAS_CBOX_HOME_MAP()  	if (hash_default)  		return PAGE_HOME_HASH; -#endif  	return smp_processor_id();  } @@ -145,14 +129,9 @@ static void __init assign_pte(pmd_t *pmd, pte_t *page_table)  #ifdef __tilegx__ -#if HV_L1_SIZE != HV_L2_SIZE -# error Rework assumption that L1 and L2 page tables are same size. -#endif - -/* Since pmd_t arrays and pte_t arrays are the same size, just use casts. */  static inline pmd_t *alloc_pmd(void)  { -	return (pmd_t *)alloc_pte(); +	return __alloc_bootmem(L1_KERNEL_PGTABLE_SIZE, HV_PAGE_TABLE_ALIGN, 0);  }  static inline void assign_pmd(pud_t *pud, pmd_t *pmd) @@ -169,7 +148,21 @@ void __init shatter_pmd(pmd_t *pmd)  	assign_pte(pmd, pte);  } -#ifdef CONFIG_HIGHMEM +#ifdef __tilegx__ +static pmd_t *__init get_pmd(pgd_t pgtables[], unsigned long va) +{ +	pud_t *pud = pud_offset(&pgtables[pgd_index(va)], va); +	if (pud_none(*pud)) +		assign_pmd(pud, alloc_pmd()); +	return pmd_offset(pud, va); +} +#else +static pmd_t *__init get_pmd(pgd_t pgtables[], unsigned long va) +{ +	return pmd_offset(pud_offset(&pgtables[pgd_index(va)], va), va); +} +#endif +  /*   * This function initializes a certain range of kernel virtual memory   * with new bootmem page tables, everywhere page tables are missing in @@ -182,34 +175,24 @@ void __init shatter_pmd(pmd_t *pmd)   * checking the pgd every time.   */  static void __init page_table_range_init(unsigned long start, -					 unsigned long end, pgd_t *pgd_base) +					 unsigned long end, pgd_t *pgd)  { -	pgd_t *pgd; -	int pgd_idx;  	unsigned long vaddr; - -	vaddr = start; -	pgd_idx = pgd_index(vaddr); -	pgd = pgd_base + pgd_idx; - -	for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { -		pmd_t *pmd = pmd_offset(pud_offset(pgd, vaddr), vaddr); +	start = round_down(start, PMD_SIZE); +	end = round_up(end, PMD_SIZE); +	for (vaddr = start; vaddr < end; vaddr += PMD_SIZE) { +		pmd_t *pmd = get_pmd(pgd, vaddr);  		if (pmd_none(*pmd))  			assign_pte(pmd, alloc_pte()); -		vaddr += PMD_SIZE;  	}  } -#endif /* CONFIG_HIGHMEM */ - -#if CHIP_HAS_CBOX_HOME_MAP()  static int __initdata ktext_hash = 1;  /* .text pages */  static int __initdata kdata_hash = 1;  /* .data and .bss pages */  int __write_once hash_default = 1;     /* kernel allocator pages */  EXPORT_SYMBOL(hash_default);  int __write_once kstack_hash = 1;      /* if no homecaching, use h4h */ -#endif /* CHIP_HAS_CBOX_HOME_MAP */  /*   * CPUs to use to for striping the pages of kernel data.  
If hash-for-home @@ -227,14 +210,12 @@ int __write_once kdata_huge;       /* if no homecaching, small pages */  static pgprot_t __init construct_pgprot(pgprot_t prot, int home)  {  	prot = pte_set_home(prot, home); -#if CHIP_HAS_CBOX_HOME_MAP()  	if (home == PAGE_HOME_IMMUTABLE) {  		if (ktext_hash)  			prot = hv_pte_set_mode(prot, HV_PTE_MODE_CACHE_HASH_L3);  		else  			prot = hv_pte_set_mode(prot, HV_PTE_MODE_CACHE_NO_L3);  	} -#endif  	return prot;  } @@ -246,40 +227,28 @@ static pgprot_t __init init_pgprot(ulong address)  {  	int cpu;  	unsigned long page; -	enum { CODE_DELTA = MEM_SV_INTRPT - PAGE_OFFSET }; +	enum { CODE_DELTA = MEM_SV_START - PAGE_OFFSET }; -#if CHIP_HAS_CBOX_HOME_MAP()  	/* For kdata=huge, everything is just hash-for-home. */  	if (kdata_huge)  		return construct_pgprot(PAGE_KERNEL, PAGE_HOME_HASH); -#endif  	/* We map the aliased pages of permanent text inaccessible. */  	if (address < (ulong) _sinittext - CODE_DELTA)  		return PAGE_NONE; -	/* -	 * We map read-only data non-coherent for performance.  We could -	 * use neighborhood caching on TILE64, but it's not clear it's a win. -	 */ +	/* We map read-only data non-coherent for performance. */  	if ((address >= (ulong) __start_rodata &&  	     address < (ulong) __end_rodata) ||  	    address == (ulong) empty_zero_page) {  		return construct_pgprot(PAGE_KERNEL_RO, PAGE_HOME_IMMUTABLE);  	} -	/* As a performance optimization, keep the boot init stack here. */ -	if (address >= (ulong)&init_thread_union && -	    address < (ulong)&init_thread_union + THREAD_SIZE) -		return construct_pgprot(PAGE_KERNEL, smp_processor_id()); -  #ifndef __tilegx__ -#if !ATOMIC_LOCKS_FOUND_VIA_TABLE()  	/* Force the atomic_locks[] array page to be hash-for-home. */  	if (address == (ulong) atomic_locks)  		return construct_pgprot(PAGE_KERNEL, PAGE_HOME_HASH);  #endif -#endif  	/*  	 * Everything else that isn't data or bss is heap, so mark it @@ -297,28 +266,18 @@ static pgprot_t __init init_pgprot(ulong address)  	if (address >= (ulong) _end || address < (ulong) _einitdata)  		return construct_pgprot(PAGE_KERNEL, initial_heap_home()); -#if CHIP_HAS_CBOX_HOME_MAP()  	/* Use hash-for-home if requested for data/bss. */  	if (kdata_hash)  		return construct_pgprot(PAGE_KERNEL, PAGE_HOME_HASH); -#endif - -	/* -	 * Make the w1data homed like heap to start with, to avoid -	 * making it part of the page-striped data area when we're just -	 * going to convert it to read-only soon anyway. -	 */ -	if (address >= (ulong)__w1data_begin && address < (ulong)__w1data_end) -		return construct_pgprot(PAGE_KERNEL, initial_heap_home());  	/*  	 * Otherwise we just hand out consecutive cpus.  To avoid  	 * requiring this function to hold state, we just walk forward from -	 * _sdata by PAGE_SIZE, skipping the readonly and init data, to reach -	 * the requested address, while walking cpu home around kdata_mask. -	 * This is typically no more than a dozen or so iterations. +	 * __end_rodata by PAGE_SIZE, skipping the readonly and init data, to +	 * reach the requested address, while walking cpu home around +	 * kdata_mask. This is typically no more than a dozen or so iterations.  	 
*/ -	page = (((ulong)__w1data_end) + PAGE_SIZE - 1) & PAGE_MASK; +	page = (((ulong)__end_rodata) + PAGE_SIZE - 1) & PAGE_MASK;  	BUG_ON(address < page || address >= (ulong)_end);  	cpu = cpumask_first(&kdata_mask);  	for (; page < address; page += PAGE_SIZE) { @@ -328,11 +287,9 @@ static pgprot_t __init init_pgprot(ulong address)  		if (page == (ulong)empty_zero_page)  			continue;  #ifndef __tilegx__ -#if !ATOMIC_LOCKS_FOUND_VIA_TABLE()  		if (page == (ulong)atomic_locks)  			continue;  #endif -#endif  		cpu = cpumask_next(cpu, &kdata_mask);  		if (cpu == NR_CPUS)  			cpu = cpumask_first(&kdata_mask); @@ -375,7 +332,7 @@ static int __init setup_ktext(char *str)  	ktext_arg_seen = 1; -	/* Default setting on Tile64: use a huge page */ +	/* Default setting: use a huge page */  	if (strcmp(str, "huge") == 0)  		pr_info("ktext: using one huge locally cached page\n"); @@ -421,31 +378,14 @@ static inline pgprot_t ktext_set_nocache(pgprot_t prot)  {  	if (!ktext_nocache)  		prot = hv_pte_set_nc(prot); -#if CHIP_HAS_NC_AND_NOALLOC_BITS()  	else  		prot = hv_pte_set_no_alloc_l2(prot); -#endif  	return prot;  } -#ifndef __tilegx__ -static pmd_t *__init get_pmd(pgd_t pgtables[], unsigned long va) -{ -	return pmd_offset(pud_offset(&pgtables[pgd_index(va)], va), va); -} -#else -static pmd_t *__init get_pmd(pgd_t pgtables[], unsigned long va) -{ -	pud_t *pud = pud_offset(&pgtables[pgd_index(va)], va); -	if (pud_none(*pud)) -		assign_pmd(pud, alloc_pmd()); -	return pmd_offset(pud, va); -} -#endif -  /* Temporary page table we use for staging. */  static pgd_t pgtables[PTRS_PER_PGD] - __attribute__((section(".init.page"))); + __attribute__((aligned(HV_PAGE_TABLE_ALIGN)));  /*   * This maps the physical memory to kernel virtual address space, a total @@ -463,6 +403,7 @@ static pgd_t pgtables[PTRS_PER_PGD]   */  static void __init kernel_physical_mapping_init(pgd_t *pgd_base)  { +	unsigned long long irqmask;  	unsigned long address, pfn;  	pmd_t *pmd;  	pte_t *pte; @@ -471,7 +412,6 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)  	struct cpumask kstripe_mask;  	int rc, i; -#if CHIP_HAS_CBOX_HOME_MAP()  	if (ktext_arg_seen && ktext_hash) {  		pr_warning("warning: \"ktext\" boot argument ignored"  			   " if \"kcache_hash\" sets up text hash-for-home\n"); @@ -488,7 +428,6 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)  			  " kcache_hash=all or =allbutstack\n");  		kdata_huge = 0;  	} -#endif  	/*  	 * Set up a mask for cpus to use for kernel striping. 
@@ -569,8 +508,9 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)  		}  	} -	address = MEM_SV_INTRPT; +	address = MEM_SV_START;  	pmd = get_pmd(pgtables, address); +	pfn = 0;  /* code starts at PA 0 */  	if (ktext_small) {  		/* Allocate an L2 PTE for the kernel text */  		int cpu = 0; @@ -592,11 +532,16 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)  			prot = ktext_set_nocache(prot);  		} -		BUG_ON(address != (unsigned long)_stext); -		pfn = 0;  /* code starts at PA 0 */ -		pte = alloc_pte(); -		for (pte_ofs = 0; address < (unsigned long)_einittext; -		     pfn++, pte_ofs++, address += PAGE_SIZE) { +		BUG_ON(address != (unsigned long)_text); +		pte = NULL; +		for (; address < (unsigned long)_einittext; +		     pfn++, address += PAGE_SIZE) { +			pte_ofs = pte_index(address); +			if (pte_ofs == 0) { +				if (pte) +					assign_pte(pmd++, pte); +				pte = alloc_pte(); +			}  			if (!ktext_local) {  				prot = set_remote_cache_cpu(prot, cpu);  				cpu = cpumask_next(cpu, &ktext_mask); @@ -605,17 +550,16 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)  			}  			pte[pte_ofs] = pfn_pte(pfn, prot);  		} -		assign_pte(pmd, pte); +		if (pte) +			assign_pte(pmd, pte);  	} else {  		pte_t pteval = pfn_pte(0, PAGE_KERNEL_EXEC);  		pteval = pte_mkhuge(pteval); -#if CHIP_HAS_CBOX_HOME_MAP()  		if (ktext_hash) {  			pteval = hv_pte_set_mode(pteval,  						 HV_PTE_MODE_CACHE_HASH_L3);  			pteval = ktext_set_nocache(pteval);  		} else -#endif /* CHIP_HAS_CBOX_HOME_MAP() */  		if (cpumask_weight(&ktext_mask) == 1) {  			pteval = set_remote_cache_cpu(pteval,  					      cpumask_first(&ktext_mask)); @@ -628,7 +572,9 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)  		else  			pteval = hv_pte_set_mode(pteval,  						 HV_PTE_MODE_CACHE_NO_L3); -		*(pte_t *)pmd = pteval; +		for (; address < (unsigned long)_einittext; +		     pfn += PFN_DOWN(HPAGE_SIZE), address += HPAGE_SIZE) +			*(pte_t *)(pmd++) = pfn_pte(pfn, pteval);  	}  	/* Set swapper_pgprot here so it is flushed to memory right away. */ @@ -643,16 +589,30 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)  	 *  - install pgtables[] as the real page table  	 *  - flush the TLB so the new page table takes effect  	 */ +	irqmask = interrupt_mask_save_mask(); +	interrupt_mask_set_mask(-1ULL);  	rc = flush_and_install_context(__pa(pgtables),  				       init_pgprot((unsigned long)pgtables),  				       __get_cpu_var(current_asid),  				       cpumask_bits(my_cpu_mask)); +	interrupt_mask_restore_mask(irqmask);  	BUG_ON(rc != 0);  	/* Copy the page table back to the normal swapper_pg_dir. */  	memcpy(pgd_base, pgtables, sizeof(pgtables));  	__install_page_table(pgd_base, __get_cpu_var(current_asid),  			     swapper_pgprot); + +	/* +	 * We just read swapper_pgprot and thus brought it into the cache, +	 * with its new home & caching mode.  When we start the other CPUs, +	 * they're going to reference swapper_pgprot via their initial fake +	 * VA-is-PA mappings, which cache everything locally.  At that +	 * time, if it's in our cache with a conflicting home, the +	 * simulator's coherence checker will complain.  So, flush it out +	 * of our cache; we're not going to ever use it again anyway. 
+	 */ +	__insn_finv(&swapper_pgprot);  }  /* @@ -698,6 +658,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)  #endif /* CONFIG_HIGHMEM */ +#ifndef CONFIG_64BIT  static void __init init_free_pfn_range(unsigned long start, unsigned long end)  {  	unsigned long pfn; @@ -727,7 +688,7 @@ static void __init init_free_pfn_range(unsigned long start, unsigned long end)  		}  		init_page_count(page);  		__free_pages(page, order); -		totalram_pages += count; +		adjust_managed_page_count(page, count);  		page += count;  		pfn += count; @@ -740,16 +701,15 @@ static void __init set_non_bootmem_pages_init(void)  	for_each_zone(z) {  		unsigned long start, end;  		int nid = z->zone_pgdat->node_id; +#ifdef CONFIG_HIGHMEM  		int idx = zone_idx(z); +#endif  		start = z->zone_start_pfn; -		if (start == 0) -			continue;  /* bootmem */  		end = start + z->spanned_pages; -		if (idx == ZONE_NORMAL) { -			BUG_ON(start != node_start_pfn[nid]); -			start = node_free_pfn[nid]; -		} +		start = max(start, node_free_pfn[nid]); +		start = max(start, max_low_pfn); +  #ifdef CONFIG_HIGHMEM  		if (idx == ZONE_HIGHMEM)  			totalhigh_pages += z->spanned_pages; @@ -770,6 +730,7 @@ static void __init set_non_bootmem_pages_init(void)  		init_free_pfn_range(start, end);  	}  } +#endif  /*   * paging_init() sets up the page tables - note that all of lowmem is @@ -777,9 +738,6 @@ static void __init set_non_bootmem_pages_init(void)   */  void __init paging_init(void)  { -#ifdef CONFIG_HIGHMEM -	unsigned long vaddr, end; -#endif  #ifdef __tilegx__  	pud_t *pud;  #endif @@ -787,14 +745,11 @@ void __init paging_init(void)  	kernel_physical_mapping_init(pgd_base); +	/* Fixed mappings, only the page table structure has to be created. */ +	page_table_range_init(fix_to_virt(__end_of_fixed_addresses - 1), +			      FIXADDR_TOP, pgd_base); +  #ifdef CONFIG_HIGHMEM -	/* -	 * Fixed mappings, only the page table structure has to be -	 * created - mappings will be set by set_fixmap(): -	 */ -	vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; -	end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; -	page_table_range_init(vaddr, end, pgd_base);  	permanent_kmaps_init(pgd_base);  #endif @@ -806,7 +761,7 @@ void __init paging_init(void)  	 * changing init_mm once we get up and running, and there's no  	 * need for e.g. vmalloc_sync_all().  	 
*/ -	BUILD_BUG_ON(pgd_index(VMALLOC_START) != pgd_index(VMALLOC_END)); +	BUILD_BUG_ON(pgd_index(VMALLOC_START) != pgd_index(VMALLOC_END - 1));  	pud = pud_offset(pgd_base + pgd_index(VMALLOC_START), VMALLOC_START);  	assign_pmd(pud, alloc_pmd());  #endif @@ -831,15 +786,13 @@ static void __init set_max_mapnr_init(void)  void __init mem_init(void)  { -	int codesize, datasize, initsize;  	int i;  #ifndef __tilegx__  	void *last;  #endif  #ifdef CONFIG_FLATMEM -	if (!mem_map) -		BUG(); +	BUG_ON(!mem_map);  #endif  #ifdef CONFIG_HIGHMEM @@ -857,24 +810,14 @@ void __init mem_init(void)  	set_max_mapnr_init();  	/* this will put all bootmem onto the freelists */ -	totalram_pages += free_all_bootmem(); +	free_all_bootmem(); +#ifndef CONFIG_64BIT  	/* count all remaining LOWMEM and give all HIGHMEM to page allocator */  	set_non_bootmem_pages_init(); +#endif -	codesize =  (unsigned long)&_etext - (unsigned long)&_text; -	datasize =  (unsigned long)&_end - (unsigned long)&_sdata; -	initsize =  (unsigned long)&_einittext - (unsigned long)&_sinittext; -	initsize += (unsigned long)&_einitdata - (unsigned long)&_sinitdata; - -	pr_info("Memory: %luk/%luk available (%dk kernel code, %dk data, %dk init, %ldk highmem)\n", -		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10), -		num_physpages << (PAGE_SHIFT-10), -		codesize >> 10, -		datasize >> 10, -		initsize >> 10, -		(unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) -	       ); +	mem_init_print_info(NULL);  	/*  	 * In debug mode, dump some interesting memory mappings. @@ -885,10 +828,6 @@ void __init mem_init(void)  	printk(KERN_DEBUG "  PKMAP   %#lx - %#lx\n",  	       PKMAP_BASE, PKMAP_ADDR(LAST_PKMAP) - 1);  #endif -#ifdef CONFIG_HUGEVMAP -	printk(KERN_DEBUG "  HUGEMAP %#lx - %#lx\n", -	       HUGE_VMAP_BASE, HUGE_VMAP_END - 1); -#endif  	printk(KERN_DEBUG "  VMALLOC %#lx - %#lx\n",  	       _VMALLOC_START, _VMALLOC_END - 1);  #ifdef __tilegx__ @@ -944,41 +883,25 @@ int remove_memory(u64 start, u64 size)  {  	return -EINVAL;  } + +#ifdef CONFIG_MEMORY_HOTREMOVE +int arch_remove_memory(u64 start, u64 size) +{ +	/* TODO */ +	return -EBUSY; +} +#endif  #endif  struct kmem_cache *pgd_cache;  void __init pgtable_cache_init(void)  { -	pgd_cache = kmem_cache_create("pgd", -				PTRS_PER_PGD*sizeof(pgd_t), -				PTRS_PER_PGD*sizeof(pgd_t), -				0, -				NULL); +	pgd_cache = kmem_cache_create("pgd", SIZEOF_PGD, SIZEOF_PGD, 0, NULL);  	if (!pgd_cache)  		panic("pgtable_cache_init(): Cannot create pgd cache");  } -#if !CHIP_HAS_COHERENT_LOCAL_CACHE() -/* - * The __w1data area holds data that is only written during initialization, - * and is read-only and thus freely cacheable thereafter.  Fix the page - * table entries that cover that region accordingly. 
- */ -static void mark_w1data_ro(void) -{ -	/* Loop over page table entries */ -	unsigned long addr = (unsigned long)__w1data_begin; -	BUG_ON((addr & (PAGE_SIZE-1)) != 0); -	for (; addr <= (unsigned long)__w1data_end - 1; addr += PAGE_SIZE) { -		unsigned long pfn = kaddr_to_pfn((void *)addr); -		pte_t *ptep = virt_to_pte(NULL, addr); -		BUG_ON(pte_huge(*ptep));   /* not relevant for kdata_huge */ -		set_pte_at(&init_mm, addr, ptep, pfn_pte(pfn, PAGE_KERNEL_RO)); -	} -} -#endif -  #ifdef CONFIG_DEBUG_PAGEALLOC  static long __write_once initfree;  #else @@ -989,7 +912,7 @@ static long __write_once initfree = 1;  static int __init set_initfree(char *str)  {  	long val; -	if (strict_strtol(str, 0, &val)) { +	if (kstrtol(str, 0, &val) == 0) {  		initfree = val;  		pr_info("initfree: %s free init pages\n",  			initfree ? "will" : "won't"); @@ -1018,7 +941,7 @@ static void free_init_pages(char *what, unsigned long begin, unsigned long end)  		 */  		int pfn = kaddr_to_pfn((void *)addr);  		struct page *page = pfn_to_page(pfn); -		pte_t *ptep = virt_to_pte(NULL, addr); +		pte_t *ptep = virt_to_kpte(addr);  		if (!initfree) {  			/*  			 * If debugging page accesses then do not free @@ -1029,31 +952,24 @@ static void free_init_pages(char *what, unsigned long begin, unsigned long end)  			pte_clear(&init_mm, addr, ptep);  			continue;  		} -		__ClearPageReserved(page); -		init_page_count(page);  		if (pte_huge(*ptep))  			BUG_ON(!kdata_huge);  		else  			set_pte_at(&init_mm, addr, ptep,  				   pfn_pte(pfn, PAGE_KERNEL));  		memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); -		free_page(addr); -		totalram_pages++; +		free_reserved_page(page);  	}  	pr_info("Freeing %s: %ldk freed\n", what, (end - begin) >> 10);  }  void free_initmem(void)  { -	const unsigned long text_delta = MEM_SV_INTRPT - PAGE_OFFSET; +	const unsigned long text_delta = MEM_SV_START - PAGE_OFFSET;  	/* -	 * Evict the dirty initdata on the boot cpu, evict the w1data -	 * wherever it's homed, and evict all the init code everywhere. -	 * We are guaranteed that no one will touch the init pages any -	 * more, and although other cpus may be touching the w1data, -	 * we only actually change the caching on tile64, which won't -	 * be keeping local copies in the other tiles' caches anyway. +	 * Evict the cache on all cores to avoid incoherence. +	 * We are guaranteed that no one will touch the init pages any more.  	 */  	homecache_evict(&cpu_cacheable_map); @@ -1064,26 +980,11 @@ void free_initmem(void)  	/*  	 * Free the pages mapped from 0xc0000000 that correspond to code -	 * pages from MEM_SV_INTRPT that we won't use again after init. +	 * pages from MEM_SV_START that we won't use again after init.  	 */  	free_init_pages("unused kernel text",  			(unsigned long)_sinittext - text_delta,  			(unsigned long)_einittext - text_delta); - -#if !CHIP_HAS_COHERENT_LOCAL_CACHE() -	/* -	 * Upgrade the .w1data section to globally cached. -	 * We don't do this on tilepro, since the cache architecture -	 * pretty much makes it irrelevant, and in any case we end -	 * up having racing issues with other tiles that may touch -	 * the data after we flush the cache but before we update -	 * the PTEs and flush the TLBs, causing sharer shootdowns -	 * later.  Even though this is to clean data, it seems like -	 * an unnecessary complication. -	 */ -	mark_w1data_ro(); -#endif -  	/* Do a global TLB flush so everyone sees the changes. 
*/  	flush_tlb_all();  } diff --git a/arch/tile/mm/migrate.h b/arch/tile/mm/migrate.h index cd45a0837fa..91683d97917 100644 --- a/arch/tile/mm/migrate.h +++ b/arch/tile/mm/migrate.h @@ -24,6 +24,9 @@  /*   * This function is used as a helper when setting up the initial   * page table (swapper_pg_dir). + * + * You must mask ALL interrupts prior to invoking this code, since + * you can't legally touch the stack during the cache flush.   */  extern int flush_and_install_context(HV_PhysAddr page_table, HV_PTE access,  				     HV_ASID asid, @@ -39,6 +42,9 @@ extern int flush_and_install_context(HV_PhysAddr page_table, HV_PTE access,   *   * Note that any non-NULL pointers must not point to the page that   * is handled by the stack_pte itself. + * + * You must mask ALL interrupts prior to invoking this code, since + * you can't legally touch the stack during the cache flush.   */  extern int homecache_migrate_stack_and_flush(pte_t stack_pte, unsigned long va,  				     size_t length, pte_t *stack_ptep, diff --git a/arch/tile/mm/migrate_32.S b/arch/tile/mm/migrate_32.S index f738765cd1e..772085491bf 100644 --- a/arch/tile/mm/migrate_32.S +++ b/arch/tile/mm/migrate_32.S @@ -18,6 +18,7 @@  #include <linux/linkage.h>  #include <linux/threads.h>  #include <asm/page.h> +#include <asm/thread_info.h>  #include <asm/types.h>  #include <asm/asm-offsets.h>  #include <hv/hypervisor.h> @@ -39,8 +40,7 @@  #define FRAME_R32	16  #define FRAME_R33	20  #define FRAME_R34	24 -#define FRAME_R35	28 -#define FRAME_SIZE	32 +#define FRAME_SIZE	28 @@ -65,12 +65,11 @@  #define r_my_cpumask	r5  /* Locals (callee-save); must not be more than FRAME_xxx above. */ -#define r_save_ics	r30 -#define r_context_lo	r31 -#define r_context_hi	r32 -#define r_access_lo	r33 -#define r_access_hi	r34 -#define r_asid		r35 +#define r_context_lo	r30 +#define r_context_hi	r31 +#define r_access_lo	r32 +#define r_access_hi	r33 +#define r_asid		r34  STD_ENTRY(flush_and_install_context)  	/* @@ -103,11 +102,7 @@ STD_ENTRY(flush_and_install_context)  	 sw r_tmp, r33  	 addi r_tmp, sp, FRAME_R34  	} -	{ -	 sw r_tmp, r34 -	 addi r_tmp, sp, FRAME_R35 -	} -	sw r_tmp, r35 +	sw r_tmp, r34  	/* Move some arguments to callee-save registers. */  	{ @@ -120,13 +115,6 @@ STD_ENTRY(flush_and_install_context)  	}  	move r_asid, r_asid_in -	/* Disable interrupts, since we can't use our stack. */ -	{ -	 mfspr r_save_ics, INTERRUPT_CRITICAL_SECTION -	 movei r_tmp, 1 -	} -	mtspr INTERRUPT_CRITICAL_SECTION, r_tmp -  	/* First, flush our L2 cache. */  	{  	 move r0, zero  /* cache_pa */ @@ -148,7 +136,7 @@ STD_ENTRY(flush_and_install_context)  	 move r8, zero  /* asids */  	 move r9, zero  /* asidcount */  	} -	jal hv_flush_remote +	jal _hv_flush_remote  	bnz r0, .Ldone  	/* Now install the new page table. */ @@ -162,9 +150,9 @@ STD_ENTRY(flush_and_install_context)  	}  	{  	 move r4, r_asid -	 movei r5, HV_CTX_DIRECTIO +	 moveli r5, HV_CTX_DIRECTIO | CTX_PAGE_FLAG  	} -	jal hv_install_context +	jal _hv_install_context  	bnz r0, .Ldone  	/* Finally, flush the TLB. */ @@ -174,9 +162,6 @@ STD_ENTRY(flush_and_install_context)  	}  .Ldone: -	/* Reset interrupts back how they were before. */ -	mtspr INTERRUPT_CRITICAL_SECTION, r_save_ics -  	/* Restore the callee-saved registers and return. 
*/  	addli lr, sp, FRAME_SIZE  	{ @@ -201,10 +186,6 @@ STD_ENTRY(flush_and_install_context)  	}  	{  	 lw r34, r_tmp -	 addli r_tmp, sp, FRAME_R35 -	} -	{ -	 lw r35, r_tmp  	 addi sp, sp, FRAME_SIZE  	}  	jrp lr diff --git a/arch/tile/mm/migrate_64.S b/arch/tile/mm/migrate_64.S new file mode 100644 index 00000000000..a49eee38f87 --- /dev/null +++ b/arch/tile/mm/migrate_64.S @@ -0,0 +1,167 @@ +/* + * Copyright 2011 Tilera Corporation. All Rights Reserved. + * + *   This program is free software; you can redistribute it and/or + *   modify it under the terms of the GNU General Public License + *   as published by the Free Software Foundation, version 2. + * + *   This program is distributed in the hope that it will be useful, but + *   WITHOUT ANY WARRANTY; without even the implied warranty of + *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + *   NON INFRINGEMENT.  See the GNU General Public License for + *   more details. + * + * This routine is a helper for migrating the home of a set of pages to + * a new cpu.  See the documentation in homecache.c for more information. + */ + +#include <linux/linkage.h> +#include <linux/threads.h> +#include <asm/page.h> +#include <asm/thread_info.h> +#include <asm/types.h> +#include <asm/asm-offsets.h> +#include <hv/hypervisor.h> + +	.text + +/* + * First, some definitions that apply to all the code in the file. + */ + +/* Locals (caller-save) */ +#define r_tmp		r10 +#define r_save_sp	r11 + +/* What we save where in the stack frame; must include all callee-saves. */ +#define FRAME_SP	8 +#define FRAME_R30	16 +#define FRAME_R31	24 +#define FRAME_R32	32 +#define FRAME_SIZE	40 + + + + +/* + * On entry: + * + *   r0 the new context PA to install (moved to r_context) + *   r1 PTE to use for context access (moved to r_access) + *   r2 ASID to use for new context (moved to r_asid) + *   r3 pointer to cpumask with just this cpu set in it (r_my_cpumask) + */ + +/* Arguments (caller-save) */ +#define r_context_in	r0 +#define r_access_in	r1 +#define r_asid_in	r2 +#define r_my_cpumask	r3 + +/* Locals (callee-save); must not be more than FRAME_xxx above. */ +#define r_context	r30 +#define r_access	r31 +#define r_asid		r32 + +/* + * Caller-save locals and frame constants are the same as + * for homecache_migrate_stack_and_flush. + */ + +STD_ENTRY(flush_and_install_context) +	/* +	 * Create a stack frame; we can't touch it once we flush the +	 * cache until we install the new page table and flush the TLB. +	 */ +	{ +	 move r_save_sp, sp +	 st sp, lr +	 addi sp, sp, -FRAME_SIZE +	} +	addi r_tmp, sp, FRAME_SP +	{ +	 st r_tmp, r_save_sp +	 addi r_tmp, sp, FRAME_R30 +	} +	{ +	 st r_tmp, r30 +	 addi r_tmp, sp, FRAME_R31 +	} +	{ +	 st r_tmp, r31 +	 addi r_tmp, sp, FRAME_R32 +	} +	st r_tmp, r32 + +	/* Move some arguments to callee-save registers. */ +	{ +	 move r_context, r_context_in +	 move r_access, r_access_in +	} +	move r_asid, r_asid_in + +	/* First, flush our L2 cache. */ +	{ +	 move r0, zero  /* cache_pa */ +	 moveli r1, hw2_last(HV_FLUSH_EVICT_L2)  /* cache_control */ +	} +	{ +	 shl16insli r1, r1, hw1(HV_FLUSH_EVICT_L2) +	 move r2, r_my_cpumask  /* cache_cpumask */ +	} +	{ +	 shl16insli r1, r1, hw0(HV_FLUSH_EVICT_L2) +	 move r3, zero  /* tlb_va */ +	} +	{ +	 move r4, zero  /* tlb_length */ +	 move r5, zero  /* tlb_pgsize */ +	} +	{ +	 move r6, zero  /* tlb_cpumask */ +	 move r7, zero  /* asids */ +	} +	{ +	 move r8, zero  /* asidcount */ +	 jal _hv_flush_remote +	} +	bnez r0, 1f + +	/* Now install the new page table. 
*/ +	{ +	 move r0, r_context +	 move r1, r_access +	} +	{ +	 move r2, r_asid +	 moveli r3, HV_CTX_DIRECTIO | CTX_PAGE_FLAG +	} +	jal _hv_install_context +	bnez r0, 1f + +	/* Finally, flush the TLB. */ +	{ +	 movei r0, 0   /* preserve_global */ +	 jal hv_flush_all +	} + +1:	/* Restore the callee-saved registers and return. */ +	addli lr, sp, FRAME_SIZE +	{ +	 ld lr, lr +	 addli r_tmp, sp, FRAME_R30 +	} +	{ +	 ld r30, r_tmp +	 addli r_tmp, sp, FRAME_R31 +	} +	{ +	 ld r31, r_tmp +	 addli r_tmp, sp, FRAME_R32 +	} +	{ +	 ld r32, r_tmp +	 addi sp, sp, FRAME_SIZE +	} +	jrp lr +	STD_ENDPROC(flush_and_install_context) diff --git a/arch/tile/mm/mmap.c b/arch/tile/mm/mmap.c index f96f4cec602..851a94e6ae5 100644 --- a/arch/tile/mm/mmap.c +++ b/arch/tile/mm/mmap.c @@ -58,18 +58,36 @@ void arch_pick_mmap_layout(struct mm_struct *mm)  #else  	int is_32bit = 0;  #endif +	unsigned long random_factor = 0UL; + +	/* +	 *  8 bits of randomness in 32bit mmaps, 24 address space bits +	 * 12 bits of randomness in 64bit mmaps, 28 address space bits +	 */ +	if (current->flags & PF_RANDOMIZE) { +		if (is_32bit) +			random_factor = get_random_int() % (1<<8); +		else +			random_factor = get_random_int() % (1<<12); + +		random_factor <<= PAGE_SHIFT; +	}  	/*  	 * Use standard layout if the expected stack growth is unlimited  	 * or we are running native 64 bits.  	 */ -	if (!is_32bit || rlimit(RLIMIT_STACK) == RLIM_INFINITY) { -		mm->mmap_base = TASK_UNMAPPED_BASE; +	if (rlimit(RLIMIT_STACK) == RLIM_INFINITY) { +		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;  		mm->get_unmapped_area = arch_get_unmapped_area; -		mm->unmap_area = arch_unmap_area;  	} else {  		mm->mmap_base = mmap_base(mm);  		mm->get_unmapped_area = arch_get_unmapped_area_topdown; -		mm->unmap_area = arch_unmap_area_topdown;  	}  } + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ +	unsigned long range_end = mm->brk + 0x02000000; +	return randomize_range(mm->brk, range_end, 0) ? : mm->brk; +} diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c index 1f5430c53d0..5e86eac4bfa 100644 --- a/arch/tile/mm/pgtable.c +++ b/arch/tile/mm/pgtable.c @@ -27,7 +27,6 @@  #include <linux/vmalloc.h>  #include <linux/smp.h> -#include <asm/system.h>  #include <asm/pgtable.h>  #include <asm/pgalloc.h>  #include <asm/fixmap.h> @@ -41,7 +40,7 @@   * The normal show_free_areas() is too verbose on Tile, with dozens   * of processors and often four NUMA zones each with high and lowmem.   */ -void show_mem(void) +void show_mem(unsigned int filter)  {  	struct zone *zone; @@ -62,7 +61,7 @@ void show_mem(void)  	       global_page_state(NR_PAGETABLE),  	       global_page_state(NR_BOUNCE),  	       global_page_state(NR_FILE_PAGES), -	       nr_swap_pages); +	       get_nr_swap_pages());  	for_each_zone(zone) {  		unsigned long flags, order, total = 0, largest_order = -1; @@ -84,63 +83,72 @@ void show_mem(void)  	}  } -/* - * Associate a virtual page frame with a given physical page frame - * and protection flags for that frame. +/** + * shatter_huge_page() - ensure a given address is mapped by a small page. + * + * This function converts a huge PTE mapping kernel LOWMEM into a bunch + * of small PTEs with the same caching.  No cache flush required, but we + * must do a global TLB flush. + * + * Any caller that wishes to modify a kernel mapping that might + * have been made with a huge page should call this function, + * since doing so properly avoids race conditions with installing the + * newly-shattered page and then flushing all the TLB entries. 
+ * + * @addr: Address at which to shatter any existing huge page.   */ -static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) +void shatter_huge_page(unsigned long addr)  {  	pgd_t *pgd;  	pud_t *pud;  	pmd_t *pmd; -	pte_t *pte; +	unsigned long flags = 0;  /* happy compiler */ +#ifdef __PAGETABLE_PMD_FOLDED +	struct list_head *pos; +#endif -	pgd = swapper_pg_dir + pgd_index(vaddr); -	if (pgd_none(*pgd)) { -		BUG(); -		return; -	} -	pud = pud_offset(pgd, vaddr); -	if (pud_none(*pud)) { -		BUG(); +	/* Get a pointer to the pmd entry that we need to change. */ +	addr &= HPAGE_MASK; +	BUG_ON(pgd_addr_invalid(addr)); +	BUG_ON(addr < PAGE_OFFSET);  /* only for kernel LOWMEM */ +	pgd = swapper_pg_dir + pgd_index(addr); +	pud = pud_offset(pgd, addr); +	BUG_ON(!pud_present(*pud)); +	pmd = pmd_offset(pud, addr); +	BUG_ON(!pmd_present(*pmd)); +	if (!pmd_huge_page(*pmd))  		return; -	} -	pmd = pmd_offset(pud, vaddr); -	if (pmd_none(*pmd)) { -		BUG(); + +	spin_lock_irqsave(&init_mm.page_table_lock, flags); +	if (!pmd_huge_page(*pmd)) { +		/* Lost the race to convert the huge page. */ +		spin_unlock_irqrestore(&init_mm.page_table_lock, flags);  		return;  	} -	pte = pte_offset_kernel(pmd, vaddr); -	/* <pfn,flags> stored as-is, to permit clearing entries */ -	set_pte(pte, pfn_pte(pfn, flags)); - -	/* -	 * It's enough to flush this one mapping. -	 * This appears conservative since it is only called -	 * from __set_fixmap. -	 */ -	local_flush_tlb_page(NULL, vaddr, PAGE_SIZE); -} -void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags) -{ -	unsigned long address = __fix_to_virt(idx); - -	if (idx >= __end_of_fixed_addresses) { -		BUG(); -		return; +	/* Shatter the huge page into the preallocated L2 page table. */ +	pmd_populate_kernel(&init_mm, pmd, get_prealloc_pte(pmd_pfn(*pmd))); + +#ifdef __PAGETABLE_PMD_FOLDED +	/* Walk every pgd on the system and update the pmd there. */ +	spin_lock(&pgd_lock); +	list_for_each(pos, &pgd_list) { +		pmd_t *copy_pmd; +		pgd = list_to_pgd(pos) + pgd_index(addr); +		pud = pud_offset(pgd, addr); +		copy_pmd = pmd_offset(pud, addr); +		__set_pmd(copy_pmd, *pmd);  	} -	set_pte_pfn(address, phys >> PAGE_SHIFT, flags); -} +	spin_unlock(&pgd_lock); +#endif -#if defined(CONFIG_HIGHPTE) -pte_t *_pte_offset_map(pmd_t *dir, unsigned long address) -{ -	pte_t *pte = kmap_atomic(pmd_page(*dir)) + -		(pmd_ptfn(*dir) << HV_LOG2_PAGE_TABLE_ALIGN) & ~PAGE_MASK; -	return &pte[pte_index(address)]; +	/* Tell every cpu to notice the change. */ +	flush_remote(0, 0, NULL, addr, HPAGE_SIZE, HPAGE_SIZE, +		     cpu_possible_mask, NULL, 0); + +	/* Hold the lock until the TLB flush is finished to avoid races. */ +	spin_unlock_irqrestore(&init_mm.page_table_lock, flags);  } -#endif  /*   * List of all pgd's needed so it can invalidate entries in both cached @@ -148,9 +156,13 @@ pte_t *_pte_offset_map(pmd_t *dir, unsigned long address)   * against pageattr.c; it is the unique case in which a valid change   * of kernel pagetables can't be lazily synchronized by vmalloc faults.   * vmalloc faults work because attached pagetables are never freed. - * The locking scheme was chosen on the basis of manfred's - * recommendations and having no core impact whatsoever. - * -- wli + * + * The lock is always taken with interrupts disabled, unlike on x86 + * and other platforms, because we need to take the lock in + * shatter_huge_page(), which may be called from an interrupt context. 
+ * We are not at risk from the tlbflush IPI deadlock that was seen on + * x86, since we use the flush_remote() API to have the hypervisor do + * the TLB flushes regardless of irq disabling.   */  DEFINE_SPINLOCK(pgd_lock);  LIST_HEAD(pgd_list); @@ -184,9 +196,9 @@ static void pgd_ctor(pgd_t *pgd)  	BUG_ON(((u64 *)swapper_pg_dir)[pgd_index(MEM_USER_INTRPT)] != 0);  #endif -	clone_pgd_range(pgd + KERNEL_PGD_INDEX_START, -			swapper_pg_dir + KERNEL_PGD_INDEX_START, -			KERNEL_PGD_PTRS); +	memcpy(pgd + KERNEL_PGD_INDEX_START, +	       swapper_pg_dir + KERNEL_PGD_INDEX_START, +	       KERNEL_PGD_PTRS * sizeof(pgd_t));  	pgd_list_add(pgd);  	spin_unlock_irqrestore(&pgd_lock, flags); @@ -218,20 +230,32 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)  #define L2_USER_PGTABLE_PAGES (1 << L2_USER_PGTABLE_ORDER) -struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) +struct page *pgtable_alloc_one(struct mm_struct *mm, unsigned long address, +			       int order)  { -	gfp_t flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO|__GFP_COMP; +	gfp_t flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO;  	struct page *p; - -#ifdef CONFIG_HIGHPTE -	flags |= __GFP_HIGHMEM; -#endif +	int i;  	p = alloc_pages(flags, L2_USER_PGTABLE_ORDER);  	if (p == NULL)  		return NULL; -	pgtable_page_ctor(p); +	if (!pgtable_page_ctor(p)) { +		__free_pages(p, L2_USER_PGTABLE_ORDER); +		return NULL; +	} + +	/* +	 * Make every page have a page_count() of one, not just the first. +	 * We don't use __GFP_COMP since it doesn't look like it works +	 * correctly with tlb_remove_page(). +	 */ +	for (i = 1; i < order; ++i) { +		init_page_count(p+i); +		inc_zone_page_state(p+i, NR_PAGETABLE); +	} +  	return p;  } @@ -240,30 +264,30 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)   * process).  We have to correct whatever pte_alloc_one() did before   * returning the pages to the allocator.   */ -void pte_free(struct mm_struct *mm, struct page *p) +void pgtable_free(struct mm_struct *mm, struct page *p, int order)  { +	int i; +  	pgtable_page_dtor(p); -	__free_pages(p, L2_USER_PGTABLE_ORDER); +	__free_page(p); + +	for (i = 1; i < order; ++i) { +		__free_page(p+i); +		dec_zone_page_state(p+i, NR_PAGETABLE); +	}  } -void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte, -		    unsigned long address) +void __pgtable_free_tlb(struct mmu_gather *tlb, struct page *pte, +			unsigned long address, int order)  {  	int i;  	pgtable_page_dtor(pte); -	tlb->need_flush = 1; -	if (tlb_fast_mode(tlb)) { -		struct page *pte_pages[L2_USER_PGTABLE_PAGES]; -		for (i = 0; i < L2_USER_PGTABLE_PAGES; ++i) -			pte_pages[i] = pte + i; -		free_pages_and_swap_cache(pte_pages, L2_USER_PGTABLE_PAGES); -		return; -	} -	for (i = 0; i < L2_USER_PGTABLE_PAGES; ++i) { -		tlb->pages[tlb->nr++] = pte + i; -		if (tlb->nr >= FREE_PTE_NR) -			tlb_flush_mmu(tlb, 0, 0); +	tlb_remove_page(tlb, pte); + +	for (i = 1; i < order; ++i) { +		tlb_remove_page(tlb, pte + i); +		dec_zone_page_state(pte + i, NR_PAGETABLE);  	}  } @@ -304,6 +328,17 @@ void ptep_set_wrprotect(struct mm_struct *mm,  #endif +/* + * Return a pointer to the PTE that corresponds to the given + * address in the given page table.  A NULL page table just uses + * the standard kernel page table; the preferred API in this case + * is virt_to_kpte(). + * + * The returned pointer can point to a huge page in other levels + * of the page table than the bottom, if the huge page is present + * in the page table.  
For bottom-level PTEs, the returned pointer + * can point to a PTE that is either present or not. + */  pte_t *virt_to_pte(struct mm_struct* mm, unsigned long addr)  {  	pgd_t *pgd; @@ -317,13 +352,23 @@ pte_t *virt_to_pte(struct mm_struct* mm, unsigned long addr)  	pud = pud_offset(pgd, addr);  	if (!pud_present(*pud))  		return NULL; +	if (pud_huge_page(*pud)) +		return (pte_t *)pud;  	pmd = pmd_offset(pud, addr); -	if (pmd_huge_page(*pmd)) -		return (pte_t *)pmd;  	if (!pmd_present(*pmd))  		return NULL; +	if (pmd_huge_page(*pmd)) +		return (pte_t *)pmd;  	return pte_offset_kernel(pmd, addr);  } +EXPORT_SYMBOL(virt_to_pte); + +pte_t *virt_to_kpte(unsigned long kaddr) +{ +	BUG_ON(kaddr < PAGE_OFFSET); +	return virt_to_pte(NULL, kaddr); +} +EXPORT_SYMBOL(virt_to_kpte);  pgprot_t set_remote_cache_cpu(pgprot_t prot, int cpu)  { @@ -346,41 +391,65 @@ int get_remote_cache_cpu(pgprot_t prot)  	return x + y * smp_width;  } -void set_pte_order(pte_t *ptep, pte_t pte, int order) +/* + * Convert a kernel VA to a PA and homing information. + */ +int va_to_cpa_and_pte(void *va, unsigned long long *cpa, pte_t *pte)  { -	unsigned long pfn = pte_pfn(pte); -	struct page *page = pfn_to_page(pfn); +	struct page *page = virt_to_page(va); +	pte_t null_pte = { 0 }; + +	*cpa = __pa(va); -	/* Update the home of a PTE if necessary */ -	pte = pte_set_home(pte, page_home(page)); +	/* Note that this is not writing a page table, just returning a pte. */ +	*pte = pte_set_home(null_pte, page_home(page)); + +	return 0; /* return non-zero if not hfh? */ +} +EXPORT_SYMBOL(va_to_cpa_and_pte); +void __set_pte(pte_t *ptep, pte_t pte) +{  #ifdef __tilegx__  	*ptep = pte;  #else -	/* -	 * When setting a PTE, write the high bits first, then write -	 * the low bits.  This sets the "present" bit only after the -	 * other bits are in place.  If a particular PTE update -	 * involves transitioning from one valid PTE to another, it -	 * may be necessary to call set_pte_order() more than once, -	 * transitioning via a suitable intermediate state. -	 * Note that this sequence also means that if we are transitioning -	 * from any migrating PTE to a non-migrating one, we will not -	 * see a half-updated PTE with the migrating bit off. -	 */ -#if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32 -# error Must write the present and migrating bits last -#endif -	((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32); -	barrier(); -	((u32 *)ptep)[0] = (u32)(pte_val(pte)); -#endif +# if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32 +#  error Must write the present and migrating bits last +# endif +	if (pte_present(pte)) { +		((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32); +		barrier(); +		((u32 *)ptep)[0] = (u32)(pte_val(pte)); +	} else { +		((u32 *)ptep)[0] = (u32)(pte_val(pte)); +		barrier(); +		((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32); +	} +#endif /* __tilegx__ */ +} + +void set_pte(pte_t *ptep, pte_t pte) +{ +	if (pte_present(pte) && +	    (!CHIP_HAS_MMIO() || hv_pte_get_mode(pte) != HV_PTE_MODE_MMIO)) { +		/* The PTE actually references physical memory. */ +		unsigned long pfn = pte_pfn(pte); +		if (pfn_valid(pfn)) { +			/* Update the home of the PTE from the struct page. */ +			pte = pte_set_home(pte, page_home(pfn_to_page(pfn))); +		} else if (hv_pte_get_mode(pte) == 0) { +			/* remap_pfn_range(), etc, must supply PTE mode. */ +			panic("set_pte(): out-of-range PFN and mode 0\n"); +		} +	} + +	__set_pte(ptep, pte);  }  /* Can this mm load a PTE with cached_priority set? 
*/  static inline int mm_is_priority_cached(struct mm_struct *mm)  { -	return mm->context.priority_cached; +	return mm->context.priority_cached != 0;  }  /* @@ -390,8 +459,8 @@ static inline int mm_is_priority_cached(struct mm_struct *mm)  void start_mm_caching(struct mm_struct *mm)  {  	if (!mm_is_priority_cached(mm)) { -		mm->context.priority_cached = -1U; -		hv_set_caching(-1U); +		mm->context.priority_cached = -1UL; +		hv_set_caching(-1UL);  	}  } @@ -406,7 +475,7 @@ void start_mm_caching(struct mm_struct *mm)   * Presumably we'll come back later and have more luck and clear   * the value then; for now we'll just keep the cache marked for priority.   */ -static unsigned int update_priority_cached(struct mm_struct *mm) +static unsigned long update_priority_cached(struct mm_struct *mm)  {  	if (mm->context.priority_cached && down_write_trylock(&mm->mmap_sem)) {  		struct vm_area_struct *vm; @@ -474,20 +543,13 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,  	addr = area->addr;  	if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,  			       phys_addr, pgprot)) { -		remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr)); +		free_vm_area(area);  		return NULL;  	}  	return (__force void __iomem *) (offset + (char *)addr);  }  EXPORT_SYMBOL(ioremap_prot); -/* Map a PCI MMIO bus address into VA space. */ -void __iomem *ioremap(resource_size_t phys_addr, unsigned long size) -{ -	panic("ioremap for PCI MMIO is not supported"); -} -EXPORT_SYMBOL(ioremap); -  /* Unmap an MMIO VA mapping. */  void iounmap(volatile void __iomem *addr_in)  { @@ -505,12 +567,7 @@ void iounmap(volatile void __iomem *addr_in)  	   in parallel. Reuse of the virtual address is prevented by  	   leaving it in the global lists until we're done with it.  	   cpa takes care of the direct mappings. */ -	read_lock(&vmlist_lock); -	for (p = vmlist; p; p = p->next) { -		if (p->addr == addr) -			break; -	} -	read_unlock(&vmlist_lock); +	p = find_vm_area((void *)addr);  	if (!p) {  		pr_err("iounmap: bad address %p\n", addr);  | 