diff options
Diffstat (limited to 'arch/s390/mm/fault.c')
| -rw-r--r-- | arch/s390/mm/fault.c | 664 | 
1 files changed, 412 insertions, 252 deletions
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index fe5701e9efb..3f3b35403d0 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -1,8 +1,6 @@  /* - *  arch/s390/mm/fault.c - *   *  S390 version - *    Copyright (C) 1999 IBM Deutschland Entwicklung GmbH, IBM Corporation + *    Copyright IBM Corp. 1999   *    Author(s): Hartmut Penner (hp@de.ibm.com)   *               Ulrich Weigand (uweigand@de.ibm.com)   * @@ -10,6 +8,7 @@   *    Copyright (C) 1995  Linus Torvalds   */ +#include <linux/kernel_stat.h>  #include <linux/perf_event.h>  #include <linux/signal.h>  #include <linux/sched.h> @@ -31,11 +30,10 @@  #include <linux/uaccess.h>  #include <linux/hugetlb.h>  #include <asm/asm-offsets.h> -#include <asm/system.h>  #include <asm/pgtable.h> -#include <asm/s390_ext.h> +#include <asm/irq.h>  #include <asm/mmu_context.h> -#include <asm/compat.h> +#include <asm/facility.h>  #include "../kernel/entry.h"  #ifndef CONFIG_64BIT @@ -51,14 +49,20 @@  #define VM_FAULT_BADCONTEXT	0x010000  #define VM_FAULT_BADMAP		0x020000  #define VM_FAULT_BADACCESS	0x040000 +#define VM_FAULT_SIGNAL		0x080000 +#define VM_FAULT_PFAULT		0x100000 -static unsigned long store_indication; +static unsigned long store_indication __read_mostly; -void fault_init(void) +#ifdef CONFIG_64BIT +static int __init fault_init(void)  { -	if (test_facility(2) && test_facility(75)) +	if (test_facility(75))  		store_indication = 0xc00; +	return 0;  } +early_initcall(fault_init); +#endif  static inline int notify_page_fault(struct pt_regs *regs)  { @@ -102,30 +106,154 @@ void bust_spinlocks(int yes)   * Returns the address space associated with the fault.   * Returns 0 for kernel space and 1 for user space.   */ -static inline int user_space_fault(unsigned long trans_exc_code) +static inline int user_space_fault(struct pt_regs *regs)  { +	unsigned long trans_exc_code; +  	/*  	 * The lowest two bits of the translation exception  	 * identification indicate which paging table was used.  	 */ -	trans_exc_code &= 3; -	if (trans_exc_code == 2) -		/* Access via secondary space, set_fs setting decides */ +	trans_exc_code = regs->int_parm_long & 3; +	if (trans_exc_code == 3) /* home space -> kernel */ +		return 0; +	if (user_mode(regs)) +		return 1; +	if (trans_exc_code == 2) /* secondary space -> set_fs */  		return current->thread.mm_segment.ar4; -	if (user_mode == HOME_SPACE_MODE) -		/* User space if the access has been done via home space. */ -		return trans_exc_code == 3; -	/* -	 * If the user space is not the home space the kernel runs in home -	 * space. Access via secondary space has already been covered, -	 * access via primary space or access register is from user space -	 * and access via home space is from the kernel. -	 */ -	return trans_exc_code != 3; +	if (current->flags & PF_VCPU) +		return 1; +	return 0; +} + +static int bad_address(void *p) +{ +	unsigned long dummy; + +	return probe_kernel_address((unsigned long *)p, dummy); +} + +#ifdef CONFIG_64BIT +static void dump_pagetable(unsigned long asce, unsigned long address) +{ +	unsigned long *table = __va(asce & PAGE_MASK); + +	pr_alert("AS:%016lx ", asce); +	switch (asce & _ASCE_TYPE_MASK) { +	case _ASCE_TYPE_REGION1: +		table = table + ((address >> 53) & 0x7ff); +		if (bad_address(table)) +			goto bad; +		pr_cont("R1:%016lx ", *table); +		if (*table & _REGION_ENTRY_INVALID) +			goto out; +		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); +		/* fallthrough */ +	case _ASCE_TYPE_REGION2: +		table = table + ((address >> 42) & 0x7ff); +		if (bad_address(table)) +			goto bad; +		pr_cont("R2:%016lx ", *table); +		if (*table & _REGION_ENTRY_INVALID) +			goto out; +		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); +		/* fallthrough */ +	case _ASCE_TYPE_REGION3: +		table = table + ((address >> 31) & 0x7ff); +		if (bad_address(table)) +			goto bad; +		pr_cont("R3:%016lx ", *table); +		if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE)) +			goto out; +		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); +		/* fallthrough */ +	case _ASCE_TYPE_SEGMENT: +		table = table + ((address >> 20) & 0x7ff); +		if (bad_address(table)) +			goto bad; +		pr_cont(KERN_CONT "S:%016lx ", *table); +		if (*table & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE)) +			goto out; +		table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); +	} +	table = table + ((address >> 12) & 0xff); +	if (bad_address(table)) +		goto bad; +	pr_cont("P:%016lx ", *table); +out: +	pr_cont("\n"); +	return; +bad: +	pr_cont("BAD\n");  } -static inline void report_user_fault(struct pt_regs *regs, long int_code, -				     int signr, unsigned long address) +#else /* CONFIG_64BIT */ + +static void dump_pagetable(unsigned long asce, unsigned long address) +{ +	unsigned long *table = __va(asce & PAGE_MASK); + +	pr_alert("AS:%08lx ", asce); +	table = table + ((address >> 20) & 0x7ff); +	if (bad_address(table)) +		goto bad; +	pr_cont("S:%08lx ", *table); +	if (*table & _SEGMENT_ENTRY_INVALID) +		goto out; +	table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); +	table = table + ((address >> 12) & 0xff); +	if (bad_address(table)) +		goto bad; +	pr_cont("P:%08lx ", *table); +out: +	pr_cont("\n"); +	return; +bad: +	pr_cont("BAD\n"); +} + +#endif /* CONFIG_64BIT */ + +static void dump_fault_info(struct pt_regs *regs) +{ +	unsigned long asce; + +	pr_alert("Fault in "); +	switch (regs->int_parm_long & 3) { +	case 3: +		pr_cont("home space "); +		break; +	case 2: +		pr_cont("secondary space "); +		break; +	case 1: +		pr_cont("access register "); +		break; +	case 0: +		pr_cont("primary space "); +		break; +	} +	pr_cont("mode while using "); +	if (!user_space_fault(regs)) { +		asce = S390_lowcore.kernel_asce; +		pr_cont("kernel "); +	} +#ifdef CONFIG_PGSTE +	else if ((current->flags & PF_VCPU) && S390_lowcore.gmap) { +		struct gmap *gmap = (struct gmap *)S390_lowcore.gmap; +		asce = gmap->asce; +		pr_cont("gmap "); +	} +#endif +	else { +		asce = S390_lowcore.user_asce; +		pr_cont("user "); +	} +	pr_cont("ASCE.\n"); +	dump_pagetable(asce, regs->int_parm_long & __FAIL_ADDR_MASK); +} + +static inline void report_user_fault(struct pt_regs *regs, long signr)  {  	if ((task_pid_nr(current) > 1) && !show_unhandled_signals)  		return; @@ -133,10 +261,13 @@ static inline void report_user_fault(struct pt_regs *regs, long int_code,  		return;  	if (!printk_ratelimit())  		return; -	printk("User process fault: interruption code 0x%lX ", int_code); +	printk(KERN_ALERT "User process fault: interruption code 0x%X ", +	       regs->int_code);  	print_vma_addr(KERN_CONT "in ", regs->psw.addr & PSW_ADDR_INSN); -	printk("\n"); -	printk("failing address: %lX\n", address); +	printk(KERN_CONT "\n"); +	printk(KERN_ALERT "failing address: %016lx TEID: %016lx\n", +	       regs->int_parm_long & __FAIL_ADDR_MASK, regs->int_parm_long); +	dump_fault_info(regs);  	show_regs(regs);  } @@ -144,24 +275,18 @@ static inline void report_user_fault(struct pt_regs *regs, long int_code,   * Send SIGSEGV to task.  This is an external routine   * to keep the stack usage of do_page_fault small.   */ -static noinline void do_sigsegv(struct pt_regs *regs, long int_code, -				int si_code, unsigned long trans_exc_code) +static noinline void do_sigsegv(struct pt_regs *regs, int si_code)  {  	struct siginfo si; -	unsigned long address; -	address = trans_exc_code & __FAIL_ADDR_MASK; -	current->thread.prot_addr = address; -	current->thread.trap_no = int_code; -	report_user_fault(regs, int_code, SIGSEGV, address); +	report_user_fault(regs, SIGSEGV);  	si.si_signo = SIGSEGV;  	si.si_code = si_code; -	si.si_addr = (void __user *) address; +	si.si_addr = (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK);  	force_sig_info(SIGSEGV, &si, current);  } -static noinline void do_no_context(struct pt_regs *regs, long int_code, -				   unsigned long trans_exc_code) +static noinline void do_no_context(struct pt_regs *regs)  {  	const struct exception_table_entry *fixup;  	unsigned long address; @@ -169,7 +294,7 @@ static noinline void do_no_context(struct pt_regs *regs, long int_code,  	/* Are we prepared to handle this kernel fault?  */  	fixup = search_exception_tables(regs->psw.addr & PSW_ADDR_INSN);  	if (fixup) { -		regs->psw.addr = fixup->fixup | PSW_ADDR_AMODE; +		regs->psw.addr = extable_fixup(fixup) | PSW_ADDR_AMODE;  		return;  	} @@ -177,115 +302,84 @@ static noinline void do_no_context(struct pt_regs *regs, long int_code,  	 * Oops. The kernel tried to access some bad page. We'll have to  	 * terminate things with extreme prejudice.  	 */ -	address = trans_exc_code & __FAIL_ADDR_MASK; -	if (!user_space_fault(trans_exc_code)) +	address = regs->int_parm_long & __FAIL_ADDR_MASK; +	if (!user_space_fault(regs))  		printk(KERN_ALERT "Unable to handle kernel pointer dereference" -		       " at virtual kernel address %p\n", (void *)address); +		       " in virtual kernel address space\n");  	else  		printk(KERN_ALERT "Unable to handle kernel paging request" -		       " at virtual user address %p\n", (void *)address); - -	die("Oops", regs, int_code); +		       " in virtual user address space\n"); +	printk(KERN_ALERT "failing address: %016lx TEID: %016lx\n", +	       regs->int_parm_long & __FAIL_ADDR_MASK, regs->int_parm_long); +	dump_fault_info(regs); +	die(regs, "Oops");  	do_exit(SIGKILL);  } -static noinline void do_low_address(struct pt_regs *regs, long int_code, -				    unsigned long trans_exc_code) +static noinline void do_low_address(struct pt_regs *regs)  {  	/* Low-address protection hit in kernel mode means  	   NULL pointer write access in kernel mode.  */  	if (regs->psw.mask & PSW_MASK_PSTATE) {  		/* Low-address protection hit in user mode 'cannot happen'. */ -		die ("Low-address protection", regs, int_code); +		die (regs, "Low-address protection");  		do_exit(SIGKILL);  	} -	do_no_context(regs, int_code, trans_exc_code); +	do_no_context(regs);  } -static noinline void do_sigbus(struct pt_regs *regs, long int_code, -			       unsigned long trans_exc_code) +static noinline void do_sigbus(struct pt_regs *regs)  {  	struct task_struct *tsk = current; -	unsigned long address;  	struct siginfo si;  	/*  	 * Send a sigbus, regardless of whether we were in kernel  	 * or user mode.  	 */ -	address = trans_exc_code & __FAIL_ADDR_MASK; -	tsk->thread.prot_addr = address; -	tsk->thread.trap_no = int_code;  	si.si_signo = SIGBUS;  	si.si_errno = 0;  	si.si_code = BUS_ADRERR; -	si.si_addr = (void __user *) address; +	si.si_addr = (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK);  	force_sig_info(SIGBUS, &si, tsk);  } -#ifdef CONFIG_S390_EXEC_PROTECT -static noinline int signal_return(struct pt_regs *regs, long int_code, -				  unsigned long trans_exc_code) -{ -	u16 instruction; -	int rc; - -	rc = __get_user(instruction, (u16 __user *) regs->psw.addr); - -	if (!rc && instruction == 0x0a77) { -		clear_tsk_thread_flag(current, TIF_SINGLE_STEP); -		if (is_compat_task()) -			sys32_sigreturn(); -		else -			sys_sigreturn(); -	} else if (!rc && instruction == 0x0aad) { -		clear_tsk_thread_flag(current, TIF_SINGLE_STEP); -		if (is_compat_task()) -			sys32_rt_sigreturn(); -		else -			sys_rt_sigreturn(); -	} else -		do_sigsegv(regs, int_code, SEGV_MAPERR, trans_exc_code); -	return 0; -} -#endif /* CONFIG_S390_EXEC_PROTECT */ - -static noinline void do_fault_error(struct pt_regs *regs, long int_code, -				    unsigned long trans_exc_code, int fault) +static noinline void do_fault_error(struct pt_regs *regs, int fault)  {  	int si_code;  	switch (fault) {  	case VM_FAULT_BADACCESS: -#ifdef CONFIG_S390_EXEC_PROTECT -		if ((regs->psw.mask & PSW_MASK_ASC) == PSW_ASC_SECONDARY && -		    (trans_exc_code & 3) == 0) { -			signal_return(regs, int_code, trans_exc_code); -			break; -		} -#endif /* CONFIG_S390_EXEC_PROTECT */  	case VM_FAULT_BADMAP:  		/* Bad memory access. Check if it is kernel or user space. */ -		if (regs->psw.mask & PSW_MASK_PSTATE) { +		if (user_mode(regs)) {  			/* User mode accesses just cause a SIGSEGV */  			si_code = (fault == VM_FAULT_BADMAP) ?  				SEGV_MAPERR : SEGV_ACCERR; -			do_sigsegv(regs, int_code, si_code, trans_exc_code); +			do_sigsegv(regs, si_code);  			return;  		}  	case VM_FAULT_BADCONTEXT: -		do_no_context(regs, int_code, trans_exc_code); +	case VM_FAULT_PFAULT: +		do_no_context(regs); +		break; +	case VM_FAULT_SIGNAL: +		if (!user_mode(regs)) +			do_no_context(regs);  		break;  	default: /* fault & VM_FAULT_ERROR */ -		if (fault & VM_FAULT_OOM) -			pagefault_out_of_memory(); -		else if (fault & VM_FAULT_SIGBUS) { +		if (fault & VM_FAULT_OOM) { +			if (!user_mode(regs)) +				do_no_context(regs); +			else +				pagefault_out_of_memory(); +		} else if (fault & VM_FAULT_SIGBUS) {  			/* Kernel mode? Handle exceptions or die */ -			if (!(regs->psw.mask & PSW_MASK_PSTATE)) -				do_no_context(regs, int_code, trans_exc_code); +			if (!user_mode(regs)) +				do_no_context(regs);  			else -				do_sigbus(regs, int_code, trans_exc_code); +				do_sigbus(regs);  		} else  			BUG();  		break; @@ -303,20 +397,31 @@ static noinline void do_fault_error(struct pt_regs *regs, long int_code,   *   11       Page translation     ->  Not present       (nullification)   *   3b       Region third trans.  ->  Not present       (nullification)   */ -static inline int do_exception(struct pt_regs *regs, int access, -			       unsigned long trans_exc_code) +static inline int do_exception(struct pt_regs *regs, int access)  { +#ifdef CONFIG_PGSTE +	struct gmap *gmap; +#endif  	struct task_struct *tsk;  	struct mm_struct *mm;  	struct vm_area_struct *vma; +	unsigned long trans_exc_code;  	unsigned long address; -	int fault, write; +	unsigned int flags; +	int fault; + +	tsk = current; +	/* +	 * The instruction that caused the program check has +	 * been nullified. Don't signal single step via SIGTRAP. +	 */ +	clear_pt_regs_flag(regs, PIF_PER_TRAP);  	if (notify_page_fault(regs))  		return 0; -	tsk = current;  	mm = tsk->mm; +	trans_exc_code = regs->int_parm_long;  	/*  	 * Verify that the fault happened in user space, that @@ -324,13 +429,37 @@ static inline int do_exception(struct pt_regs *regs, int access,  	 * user context.  	 */  	fault = VM_FAULT_BADCONTEXT; -	if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm)) +	if (unlikely(!user_space_fault(regs) || in_atomic() || !mm))  		goto out;  	address = trans_exc_code & __FAIL_ADDR_MASK; -	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); +	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); +	flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; +	if (user_mode(regs)) +		flags |= FAULT_FLAG_USER; +	if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400) +		flags |= FAULT_FLAG_WRITE;  	down_read(&mm->mmap_sem); +#ifdef CONFIG_PGSTE +	gmap = (struct gmap *) +		((current->flags & PF_VCPU) ? S390_lowcore.gmap : 0); +	if (gmap) { +		address = __gmap_fault(address, gmap); +		if (address == -EFAULT) { +			fault = VM_FAULT_BADMAP; +			goto out_up; +		} +		if (address == -ENOMEM) { +			fault = VM_FAULT_OOM; +			goto out_up; +		} +		if (gmap->pfault_enabled) +			flags |= FAULT_FLAG_RETRY_NOWAIT; +	} +#endif + +retry:  	fault = VM_FAULT_BADMAP;  	vma = find_vma(mm, address);  	if (!vma) @@ -358,27 +487,49 @@ static inline int do_exception(struct pt_regs *regs, int access,  	 * make sure we exit gracefully rather than endlessly redo  	 * the fault.  	 */ -	write = (access == VM_WRITE || -		 (trans_exc_code & store_indication) == 0x400) ? -		FAULT_FLAG_WRITE : 0; -	fault = handle_mm_fault(mm, vma, address, write); +	fault = handle_mm_fault(mm, vma, address, flags); +	/* No reason to continue if interrupted by SIGKILL. */ +	if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) { +		fault = VM_FAULT_SIGNAL; +		goto out; +	}  	if (unlikely(fault & VM_FAULT_ERROR))  		goto out_up; -	if (fault & VM_FAULT_MAJOR) { -		tsk->maj_flt++; -		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, -				     regs, address); -	} else { -		tsk->min_flt++; -		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, -				     regs, address); -	}  	/* -	 * The instruction that caused the program check will -	 * be repeated. Don't signal single step via SIGTRAP. +	 * Major/minor page fault accounting is only done on the +	 * initial attempt. If we go through a retry, it is extremely +	 * likely that the page will be found in page cache at that point.  	 */ -	clear_tsk_thread_flag(tsk, TIF_SINGLE_STEP); +	if (flags & FAULT_FLAG_ALLOW_RETRY) { +		if (fault & VM_FAULT_MAJOR) { +			tsk->maj_flt++; +			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, +				      regs, address); +		} else { +			tsk->min_flt++; +			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, +				      regs, address); +		} +		if (fault & VM_FAULT_RETRY) { +#ifdef CONFIG_PGSTE +			if (gmap && (flags & FAULT_FLAG_RETRY_NOWAIT)) { +				/* FAULT_FLAG_RETRY_NOWAIT has been set, +				 * mmap_sem has not been released */ +				current->thread.gmap_pfault = 1; +				fault = VM_FAULT_PFAULT; +				goto out_up; +			} +#endif +			/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk +			 * of starvation. */ +			flags &= ~(FAULT_FLAG_ALLOW_RETRY | +				   FAULT_FLAG_RETRY_NOWAIT); +			flags |= FAULT_FLAG_TRIED; +			down_read(&mm->mmap_sem); +			goto retry; +		} +	}  	fault = 0;  out_up:  	up_read(&mm->mmap_sem); @@ -386,102 +537,48 @@ out:  	return fault;  } -void __kprobes do_protection_exception(struct pt_regs *regs, long pgm_int_code, -				       unsigned long trans_exc_code) +void __kprobes do_protection_exception(struct pt_regs *regs)  { +	unsigned long trans_exc_code;  	int fault; -	/* Protection exception is supressing, decrement psw address. */ -	regs->psw.addr -= (pgm_int_code >> 16); +	trans_exc_code = regs->int_parm_long; +	/* +	 * Protection exceptions are suppressing, decrement psw address. +	 * The exception to this rule are aborted transactions, for these +	 * the PSW already points to the correct location. +	 */ +	if (!(regs->int_code & 0x200)) +		regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16);  	/*  	 * Check for low-address protection.  This needs to be treated  	 * as a special case because the translation exception code  	 * field is not guaranteed to contain valid data in this case.  	 */  	if (unlikely(!(trans_exc_code & 4))) { -		do_low_address(regs, pgm_int_code, trans_exc_code); +		do_low_address(regs);  		return;  	} -	fault = do_exception(regs, VM_WRITE, trans_exc_code); +	fault = do_exception(regs, VM_WRITE);  	if (unlikely(fault)) -		do_fault_error(regs, 4, trans_exc_code, fault); +		do_fault_error(regs, fault);  } -void __kprobes do_dat_exception(struct pt_regs *regs, long pgm_int_code, -				unsigned long trans_exc_code) +void __kprobes do_dat_exception(struct pt_regs *regs)  {  	int access, fault;  	access = VM_READ | VM_EXEC | VM_WRITE; -#ifdef CONFIG_S390_EXEC_PROTECT -	if ((regs->psw.mask & PSW_MASK_ASC) == PSW_ASC_SECONDARY && -	    (trans_exc_code & 3) == 0) -		access = VM_EXEC; -#endif -	fault = do_exception(regs, access, trans_exc_code); +	fault = do_exception(regs, access);  	if (unlikely(fault)) -		do_fault_error(regs, pgm_int_code & 255, trans_exc_code, fault); -} - -#ifdef CONFIG_64BIT -void __kprobes do_asce_exception(struct pt_regs *regs, long pgm_int_code, -				 unsigned long trans_exc_code) -{ -	struct mm_struct *mm = current->mm; -	struct vm_area_struct *vma; - -	if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm)) -		goto no_context; - -	down_read(&mm->mmap_sem); -	vma = find_vma(mm, trans_exc_code & __FAIL_ADDR_MASK); -	up_read(&mm->mmap_sem); - -	if (vma) { -		update_mm(mm, current); -		return; -	} - -	/* User mode accesses just cause a SIGSEGV */ -	if (regs->psw.mask & PSW_MASK_PSTATE) { -		do_sigsegv(regs, pgm_int_code, SEGV_MAPERR, trans_exc_code); -		return; -	} - -no_context: -	do_no_context(regs, pgm_int_code, trans_exc_code); -} -#endif - -int __handle_fault(unsigned long uaddr, unsigned long pgm_int_code, int write) -{ -	struct pt_regs regs; -	int access, fault; - -	regs.psw.mask = psw_kernel_bits; -	if (!irqs_disabled()) -		regs.psw.mask |= PSW_MASK_IO | PSW_MASK_EXT; -	regs.psw.addr = (unsigned long) __builtin_return_address(0); -	regs.psw.addr |= PSW_ADDR_AMODE; -	uaddr &= PAGE_MASK; -	access = write ? VM_WRITE : VM_READ; -	fault = do_exception(®s, access, uaddr | 2); -	if (unlikely(fault)) { -		if (fault & VM_FAULT_OOM) { -			pagefault_out_of_memory(); -			fault = 0; -		} else if (fault & VM_FAULT_SIGBUS) -			do_sigbus(®s, pgm_int_code, uaddr); -	} -	return fault ? -EFAULT : 0; +		do_fault_error(regs, fault);  }  #ifdef CONFIG_PFAULT   /*   * 'pfault' pseudo page faults routines.   */ -static ext_int_info_t ext_int_pfault; -static int pfault_disable = 0; +static int pfault_disable;  static int __init nopfault(char *str)  { @@ -491,25 +588,31 @@ static int __init nopfault(char *str)  __setup("nopfault", nopfault); -typedef struct { -	__u16 refdiagc; -	__u16 reffcode; -	__u16 refdwlen; -	__u16 refversn; -	__u64 refgaddr; -	__u64 refselmk; -	__u64 refcmpmk; -	__u64 reserved; -} __attribute__ ((packed, aligned(8))) pfault_refbk_t; +struct pfault_refbk { +	u16 refdiagc; +	u16 reffcode; +	u16 refdwlen; +	u16 refversn; +	u64 refgaddr; +	u64 refselmk; +	u64 refcmpmk; +	u64 reserved; +} __attribute__ ((packed, aligned(8)));  int pfault_init(void)  { -	pfault_refbk_t refbk = -		{ 0x258, 0, 5, 2, __LC_CURRENT, 1ULL << 48, 1ULL << 48, -		  __PF_RES_FIELD }; +	struct pfault_refbk refbk = { +		.refdiagc = 0x258, +		.reffcode = 0, +		.refdwlen = 5, +		.refversn = 2, +		.refgaddr = __LC_CURRENT_PID, +		.refselmk = 1ULL << 48, +		.refcmpmk = 1ULL << 48, +		.reserved = __PF_RES_FIELD };          int rc; -	if (!MACHINE_IS_VM || pfault_disable) +	if (pfault_disable)  		return -1;  	asm volatile(  		"	diag	%1,%0,0x258\n" @@ -518,18 +621,20 @@ int pfault_init(void)  		"2:\n"  		EX_TABLE(0b,1b)  		: "=d" (rc) : "a" (&refbk), "m" (refbk) : "cc"); -        __ctl_set_bit(0, 9);          return rc;  }  void pfault_fini(void)  { -	pfault_refbk_t refbk = -	{ 0x258, 1, 5, 2, 0ULL, 0ULL, 0ULL, 0ULL }; - -	if (!MACHINE_IS_VM || pfault_disable) +	struct pfault_refbk refbk = { +		.refdiagc = 0x258, +		.reffcode = 1, +		.refdwlen = 5, +		.refversn = 2, +	}; + +	if (pfault_disable)  		return; -	__ctl_clear_bit(0,9);  	asm volatile(  		"	diag	%0,0,0x258\n"  		"0:\n" @@ -537,11 +642,15 @@ void pfault_fini(void)  		: : "a" (&refbk), "m" (refbk) : "cc");  } -static void pfault_interrupt(unsigned int ext_int_code, +static DEFINE_SPINLOCK(pfault_lock); +static LIST_HEAD(pfault_list); + +static void pfault_interrupt(struct ext_code ext_code,  			     unsigned int param32, unsigned long param64)  {  	struct task_struct *tsk;  	__u16 subcode; +	pid_t pid;  	/*  	 * Get the external interruption subcode & pfault @@ -549,67 +658,118 @@ static void pfault_interrupt(unsigned int ext_int_code,  	 * in the 'cpu address' field associated with the           * external interrupt.   	 */ -	subcode = ext_int_code >> 16; +	subcode = ext_code.subcode;  	if ((subcode & 0xff00) != __SUBCODE_MASK)  		return; - -	/* -	 * Get the token (= address of the task structure of the affected task). -	 */ -#ifdef CONFIG_64BIT -	tsk = *(struct task_struct **) param64; -#else -	tsk = *(struct task_struct **) param32; -#endif - +	inc_irq_stat(IRQEXT_PFL); +	/* Get the token (= pid of the affected task). */ +	pid = sizeof(void *) == 4 ? param32 : param64; +	rcu_read_lock(); +	tsk = find_task_by_pid_ns(pid, &init_pid_ns); +	if (tsk) +		get_task_struct(tsk); +	rcu_read_unlock(); +	if (!tsk) +		return; +	spin_lock(&pfault_lock);  	if (subcode & 0x0080) {  		/* signal bit is set -> a page has been swapped in by VM */ -		if (xchg(&tsk->thread.pfault_wait, -1) != 0) { +		if (tsk->thread.pfault_wait == 1) {  			/* Initial interrupt was faster than the completion  			 * interrupt. pfault_wait is valid. Set pfault_wait  			 * back to zero and wake up the process. This can  			 * safely be done because the task is still sleeping  			 * and can't produce new pfaults. */  			tsk->thread.pfault_wait = 0; +			list_del(&tsk->thread.list);  			wake_up_process(tsk);  			put_task_struct(tsk); +		} else { +			/* Completion interrupt was faster than initial +			 * interrupt. Set pfault_wait to -1 so the initial +			 * interrupt doesn't put the task to sleep. +			 * If the task is not running, ignore the completion +			 * interrupt since it must be a leftover of a PFAULT +			 * CANCEL operation which didn't remove all pending +			 * completion interrupts. */ +			if (tsk->state == TASK_RUNNING) +				tsk->thread.pfault_wait = -1;  		}  	} else {  		/* signal bit not set -> a real page is missing. */ -		get_task_struct(tsk); -		set_task_state(tsk, TASK_UNINTERRUPTIBLE); -		if (xchg(&tsk->thread.pfault_wait, 1) != 0) { +		if (WARN_ON_ONCE(tsk != current)) +			goto out; +		if (tsk->thread.pfault_wait == 1) { +			/* Already on the list with a reference: put to sleep */ +			__set_task_state(tsk, TASK_UNINTERRUPTIBLE); +			set_tsk_need_resched(tsk); +		} else if (tsk->thread.pfault_wait == -1) {  			/* Completion interrupt was faster than the initial -			 * interrupt (swapped in a -1 for pfault_wait). Set -			 * pfault_wait back to zero and exit. This can be -			 * done safely because tsk is running in kernel  -			 * mode and can't produce new pfaults. */ +			 * interrupt (pfault_wait == -1). Set pfault_wait +			 * back to zero and exit. */  			tsk->thread.pfault_wait = 0; -			set_task_state(tsk, TASK_RUNNING); -			put_task_struct(tsk); -		} else +		} else { +			/* Initial interrupt arrived before completion +			 * interrupt. Let the task sleep. +			 * An extra task reference is needed since a different +			 * cpu may set the task state to TASK_RUNNING again +			 * before the scheduler is reached. */ +			get_task_struct(tsk); +			tsk->thread.pfault_wait = 1; +			list_add(&tsk->thread.list, &pfault_list); +			__set_task_state(tsk, TASK_UNINTERRUPTIBLE);  			set_tsk_need_resched(tsk); +		}  	} +out: +	spin_unlock(&pfault_lock); +	put_task_struct(tsk);  } -void __init pfault_irq_init(void) +static int pfault_cpu_notify(struct notifier_block *self, unsigned long action, +			     void *hcpu)  { -	if (!MACHINE_IS_VM) -		return; +	struct thread_struct *thread, *next; +	struct task_struct *tsk; -	/* -	 * Try to get pfault pseudo page faults going. -	 */ -	if (register_early_external_interrupt(0x2603, pfault_interrupt, -					      &ext_int_pfault) != 0) -		panic("Couldn't request external interrupt 0x2603"); +	switch (action & ~CPU_TASKS_FROZEN) { +	case CPU_DEAD: +		spin_lock_irq(&pfault_lock); +		list_for_each_entry_safe(thread, next, &pfault_list, list) { +			thread->pfault_wait = 0; +			list_del(&thread->list); +			tsk = container_of(thread, struct task_struct, thread); +			wake_up_process(tsk); +			put_task_struct(tsk); +		} +		spin_unlock_irq(&pfault_lock); +		break; +	default: +		break; +	} +	return NOTIFY_OK; +} -	if (pfault_init() == 0) -		return; +static int __init pfault_irq_init(void) +{ +	int rc; -	/* Tough luck, no pfault. */ +	rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); +	if (rc) +		goto out_extint; +	rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP; +	if (rc) +		goto out_pfault; +	irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL); +	hotcpu_notifier(pfault_cpu_notify, 0); +	return 0; + +out_pfault: +	unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); +out_extint:  	pfault_disable = 1; -	unregister_early_external_interrupt(0x2603, pfault_interrupt, -					    &ext_int_pfault); +	return rc;  } -#endif +early_initcall(pfault_irq_init); + +#endif /* CONFIG_PFAULT */  | 
