Diffstat (limited to 'arch/s390/mm/pgtable.c')
 -rw-r--r--   arch/s390/mm/pgtable.c | 1590
 1 file changed, 1305 insertions, 285 deletions
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 0c719c61972..37b8241ec78 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -1,5 +1,5 @@  /* - *    Copyright IBM Corp. 2007,2009 + *    Copyright IBM Corp. 2007, 2011   *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>   */ @@ -16,188 +16,60 @@  #include <linux/module.h>  #include <linux/quicklist.h>  #include <linux/rcupdate.h> +#include <linux/slab.h> +#include <linux/swapops.h> -#include <asm/system.h>  #include <asm/pgtable.h>  #include <asm/pgalloc.h>  #include <asm/tlb.h>  #include <asm/tlbflush.h>  #include <asm/mmu_context.h> -struct rcu_table_freelist { -	struct rcu_head rcu; -	struct mm_struct *mm; -	unsigned int pgt_index; -	unsigned int crst_index; -	unsigned long *table[0]; -}; - -#define RCU_FREELIST_SIZE \ -	((PAGE_SIZE - sizeof(struct rcu_table_freelist)) \ -	  / sizeof(unsigned long)) - -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); -static DEFINE_PER_CPU(struct rcu_table_freelist *, rcu_table_freelist); - -static void __page_table_free(struct mm_struct *mm, unsigned long *table); -static void __crst_table_free(struct mm_struct *mm, unsigned long *table); - -static struct rcu_table_freelist *rcu_table_freelist_get(struct mm_struct *mm) -{ -	struct rcu_table_freelist **batchp = &__get_cpu_var(rcu_table_freelist); -	struct rcu_table_freelist *batch = *batchp; - -	if (batch) -		return batch; -	batch = (struct rcu_table_freelist *) __get_free_page(GFP_ATOMIC); -	if (batch) { -		batch->mm = mm; -		batch->pgt_index = 0; -		batch->crst_index = RCU_FREELIST_SIZE; -		*batchp = batch; -	} -	return batch; -} - -static void rcu_table_freelist_callback(struct rcu_head *head) -{ -	struct rcu_table_freelist *batch = -		container_of(head, struct rcu_table_freelist, rcu); - -	while (batch->pgt_index > 0) -		__page_table_free(batch->mm, batch->table[--batch->pgt_index]); -	while (batch->crst_index < RCU_FREELIST_SIZE) -		__crst_table_free(batch->mm, batch->table[batch->crst_index++]); -	free_page((unsigned long) batch); -} - -void rcu_table_freelist_finish(void) -{ -	struct rcu_table_freelist *batch = __get_cpu_var(rcu_table_freelist); - -	if (!batch) -		return; -	call_rcu(&batch->rcu, rcu_table_freelist_callback); -	__get_cpu_var(rcu_table_freelist) = NULL; -} - -static void smp_sync(void *arg) -{ -} -  #ifndef CONFIG_64BIT  #define ALLOC_ORDER	1 -#define TABLES_PER_PAGE	4 -#define FRAG_MASK	15UL -#define SECOND_HALVES	10UL - -void clear_table_pgstes(unsigned long *table) -{ -	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/4); -	memset(table + 256, 0, PAGE_SIZE/4); -	clear_table(table + 512, _PAGE_TYPE_EMPTY, PAGE_SIZE/4); -	memset(table + 768, 0, PAGE_SIZE/4); -} - +#define FRAG_MASK	0x0f  #else  #define ALLOC_ORDER	2 -#define TABLES_PER_PAGE	2 -#define FRAG_MASK	3UL -#define SECOND_HALVES	2UL - -void clear_table_pgstes(unsigned long *table) -{ -	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2); -	memset(table + 256, 0, PAGE_SIZE/2); -} - +#define FRAG_MASK	0x03  #endif -unsigned long VMALLOC_START = VMALLOC_END - VMALLOC_SIZE; -EXPORT_SYMBOL(VMALLOC_START); -static int __init parse_vmalloc(char *arg) -{ -	if (!arg) -		return -EINVAL; -	VMALLOC_START = (VMALLOC_END - memparse(arg, &arg)) & PAGE_MASK; -	return 0; -} -early_param("vmalloc", parse_vmalloc); - -unsigned long *crst_table_alloc(struct mm_struct *mm, int noexec) +unsigned long *crst_table_alloc(struct mm_struct *mm)  {  	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);  	if (!page)  		return NULL; -	page->index = 0; -	if 
(noexec) { -		struct page *shadow = alloc_pages(GFP_KERNEL, ALLOC_ORDER); -		if (!shadow) { -			__free_pages(page, ALLOC_ORDER); -			return NULL; -		} -		page->index = page_to_phys(shadow); -	} -	spin_lock_bh(&mm->context.list_lock); -	list_add(&page->lru, &mm->context.crst_list); -	spin_unlock_bh(&mm->context.list_lock);  	return (unsigned long *) page_to_phys(page);  } -static void __crst_table_free(struct mm_struct *mm, unsigned long *table) -{ -	unsigned long *shadow = get_shadow_table(table); - -	if (shadow) -		free_pages((unsigned long) shadow, ALLOC_ORDER); -	free_pages((unsigned long) table, ALLOC_ORDER); -} -  void crst_table_free(struct mm_struct *mm, unsigned long *table)  { -	struct page *page = virt_to_page(table); - -	spin_lock_bh(&mm->context.list_lock); -	list_del(&page->lru); -	spin_unlock_bh(&mm->context.list_lock); -	__crst_table_free(mm, table); +	free_pages((unsigned long) table, ALLOC_ORDER);  } -void crst_table_free_rcu(struct mm_struct *mm, unsigned long *table) +#ifdef CONFIG_64BIT +static void __crst_table_upgrade(void *arg)  { -	struct rcu_table_freelist *batch; -	struct page *page = virt_to_page(table); +	struct mm_struct *mm = arg; -	spin_lock_bh(&mm->context.list_lock); -	list_del(&page->lru); -	spin_unlock_bh(&mm->context.list_lock); -	if (atomic_read(&mm->mm_users) < 2 && -	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) { -		__crst_table_free(mm, table); -		return; +	if (current->active_mm == mm) { +		clear_user_asce(); +		set_user_asce(mm);  	} -	batch = rcu_table_freelist_get(mm); -	if (!batch) { -		smp_call_function(smp_sync, NULL, 1); -		__crst_table_free(mm, table); -		return; -	} -	batch->table[--batch->crst_index] = table; -	if (batch->pgt_index >= batch->crst_index) -		rcu_table_freelist_finish(); +	__tlb_flush_local();  } -#ifdef CONFIG_64BIT  int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)  {  	unsigned long *table, *pgd;  	unsigned long entry; +	int flush;  	BUG_ON(limit > (1UL << 53)); +	flush = 0;  repeat: -	table = crst_table_alloc(mm, mm->context.noexec); +	table = crst_table_alloc(mm);  	if (!table)  		return -ENOMEM;  	spin_lock_bh(&mm->page_table_lock); @@ -221,13 +93,15 @@ repeat:  		mm->pgd = (pgd_t *) table;  		mm->task_size = mm->context.asce_limit;  		table = NULL; +		flush = 1;  	}  	spin_unlock_bh(&mm->page_table_lock);  	if (table)  		crst_table_free(mm, table);  	if (mm->context.asce_limit < limit)  		goto repeat; -	update_mm(mm, current); +	if (flush) +		on_each_cpu(__crst_table_upgrade, mm, 0);  	return 0;  } @@ -235,9 +109,10 @@ void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)  {  	pgd_t *pgd; -	if (mm->context.asce_limit <= limit) -		return; -	__tlb_flush_mm(mm); +	if (current->active_mm == mm) { +		clear_user_asce(); +		__tlb_flush_mm(mm); +	}  	while (mm->context.asce_limit > limit) {  		pgd = mm->pgd;  		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) { @@ -260,141 +135,1228 @@ void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)  		mm->task_size = mm->context.asce_limit;  		crst_table_free(mm, (unsigned long *) pgd);  	} -	update_mm(mm, current); +	if (current->active_mm == mm) +		set_user_asce(mm);  }  #endif +#ifdef CONFIG_PGSTE + +/** + * gmap_alloc - allocate a guest address space + * @mm: pointer to the parent mm_struct + * + * Returns a guest address space structure. 
+ */ +struct gmap *gmap_alloc(struct mm_struct *mm) +{ +	struct gmap *gmap; +	struct page *page; +	unsigned long *table; + +	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL); +	if (!gmap) +		goto out; +	INIT_LIST_HEAD(&gmap->crst_list); +	gmap->mm = mm; +	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER); +	if (!page) +		goto out_free; +	list_add(&page->lru, &gmap->crst_list); +	table = (unsigned long *) page_to_phys(page); +	crst_table_init(table, _REGION1_ENTRY_EMPTY); +	gmap->table = table; +	gmap->asce = _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH | +		     _ASCE_USER_BITS | __pa(table); +	list_add(&gmap->list, &mm->context.gmap_list); +	return gmap; + +out_free: +	kfree(gmap); +out: +	return NULL; +} +EXPORT_SYMBOL_GPL(gmap_alloc); + +static int gmap_unlink_segment(struct gmap *gmap, unsigned long *table) +{ +	struct gmap_pgtable *mp; +	struct gmap_rmap *rmap; +	struct page *page; + +	if (*table & _SEGMENT_ENTRY_INVALID) +		return 0; +	page = pfn_to_page(*table >> PAGE_SHIFT); +	mp = (struct gmap_pgtable *) page->index; +	list_for_each_entry(rmap, &mp->mapper, list) { +		if (rmap->entry != table) +			continue; +		list_del(&rmap->list); +		kfree(rmap); +		break; +	} +	*table = mp->vmaddr | _SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_PROTECT; +	return 1; +} + +static void gmap_flush_tlb(struct gmap *gmap) +{ +	if (MACHINE_HAS_IDTE) +		__tlb_flush_asce(gmap->mm, (unsigned long) gmap->table | +				 _ASCE_TYPE_REGION1); +	else +		__tlb_flush_global(); +} + +/** + * gmap_free - free a guest address space + * @gmap: pointer to the guest address space structure + */ +void gmap_free(struct gmap *gmap) +{ +	struct page *page, *next; +	unsigned long *table; +	int i; + + +	/* Flush tlb. */ +	if (MACHINE_HAS_IDTE) +		__tlb_flush_asce(gmap->mm, (unsigned long) gmap->table | +				 _ASCE_TYPE_REGION1); +	else +		__tlb_flush_global(); + +	/* Free all segment & region tables. */ +	down_read(&gmap->mm->mmap_sem); +	spin_lock(&gmap->mm->page_table_lock); +	list_for_each_entry_safe(page, next, &gmap->crst_list, lru) { +		table = (unsigned long *) page_to_phys(page); +		if ((*table & _REGION_ENTRY_TYPE_MASK) == 0) +			/* Remove gmap rmap structures for segment table. */ +			for (i = 0; i < PTRS_PER_PMD; i++, table++) +				gmap_unlink_segment(gmap, table); +		__free_pages(page, ALLOC_ORDER); +	} +	spin_unlock(&gmap->mm->page_table_lock); +	up_read(&gmap->mm->mmap_sem); +	list_del(&gmap->list); +	kfree(gmap); +} +EXPORT_SYMBOL_GPL(gmap_free); + +/** + * gmap_enable - switch primary space to the guest address space + * @gmap: pointer to the guest address space structure + */ +void gmap_enable(struct gmap *gmap) +{ +	S390_lowcore.gmap = (unsigned long) gmap; +} +EXPORT_SYMBOL_GPL(gmap_enable); + +/** + * gmap_disable - switch back to the standard primary address space + * @gmap: pointer to the guest address space structure + */ +void gmap_disable(struct gmap *gmap) +{ +	S390_lowcore.gmap = 0UL; +} +EXPORT_SYMBOL_GPL(gmap_disable); +  /* - * page table entry allocation/free routines. 
+ * gmap_alloc_table is assumed to be called with mmap_sem held + */ +static int gmap_alloc_table(struct gmap *gmap, +			    unsigned long *table, unsigned long init) +	__releases(&gmap->mm->page_table_lock) +	__acquires(&gmap->mm->page_table_lock) +{ +	struct page *page; +	unsigned long *new; + +	/* since we dont free the gmap table until gmap_free we can unlock */ +	spin_unlock(&gmap->mm->page_table_lock); +	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER); +	spin_lock(&gmap->mm->page_table_lock); +	if (!page) +		return -ENOMEM; +	new = (unsigned long *) page_to_phys(page); +	crst_table_init(new, init); +	if (*table & _REGION_ENTRY_INVALID) { +		list_add(&page->lru, &gmap->crst_list); +		*table = (unsigned long) new | _REGION_ENTRY_LENGTH | +			(*table & _REGION_ENTRY_TYPE_MASK); +	} else +		__free_pages(page, ALLOC_ORDER); +	return 0; +} + +/** + * gmap_unmap_segment - unmap segment from the guest address space + * @gmap: pointer to the guest address space structure + * @addr: address in the guest address space + * @len: length of the memory area to unmap + * + * Returns 0 if the unmap succeeded, -EINVAL if not. + */ +int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len) +{ +	unsigned long *table; +	unsigned long off; +	int flush; + +	if ((to | len) & (PMD_SIZE - 1)) +		return -EINVAL; +	if (len == 0 || to + len < to) +		return -EINVAL; + +	flush = 0; +	down_read(&gmap->mm->mmap_sem); +	spin_lock(&gmap->mm->page_table_lock); +	for (off = 0; off < len; off += PMD_SIZE) { +		/* Walk the guest addr space page table */ +		table = gmap->table + (((to + off) >> 53) & 0x7ff); +		if (*table & _REGION_ENTRY_INVALID) +			goto out; +		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); +		table = table + (((to + off) >> 42) & 0x7ff); +		if (*table & _REGION_ENTRY_INVALID) +			goto out; +		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); +		table = table + (((to + off) >> 31) & 0x7ff); +		if (*table & _REGION_ENTRY_INVALID) +			goto out; +		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); +		table = table + (((to + off) >> 20) & 0x7ff); + +		/* Clear segment table entry in guest address space. */ +		flush |= gmap_unlink_segment(gmap, table); +		*table = _SEGMENT_ENTRY_INVALID; +	} +out: +	spin_unlock(&gmap->mm->page_table_lock); +	up_read(&gmap->mm->mmap_sem); +	if (flush) +		gmap_flush_tlb(gmap); +	return 0; +} +EXPORT_SYMBOL_GPL(gmap_unmap_segment); + +/** + * gmap_mmap_segment - map a segment to the guest address space + * @gmap: pointer to the guest address space structure + * @from: source address in the parent address space + * @to: target address in the guest address space + * + * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not. 
+ */ +int gmap_map_segment(struct gmap *gmap, unsigned long from, +		     unsigned long to, unsigned long len) +{ +	unsigned long *table; +	unsigned long off; +	int flush; + +	if ((from | to | len) & (PMD_SIZE - 1)) +		return -EINVAL; +	if (len == 0 || from + len > TASK_MAX_SIZE || +	    from + len < from || to + len < to) +		return -EINVAL; + +	flush = 0; +	down_read(&gmap->mm->mmap_sem); +	spin_lock(&gmap->mm->page_table_lock); +	for (off = 0; off < len; off += PMD_SIZE) { +		/* Walk the gmap address space page table */ +		table = gmap->table + (((to + off) >> 53) & 0x7ff); +		if ((*table & _REGION_ENTRY_INVALID) && +		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY)) +			goto out_unmap; +		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); +		table = table + (((to + off) >> 42) & 0x7ff); +		if ((*table & _REGION_ENTRY_INVALID) && +		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY)) +			goto out_unmap; +		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); +		table = table + (((to + off) >> 31) & 0x7ff); +		if ((*table & _REGION_ENTRY_INVALID) && +		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY)) +			goto out_unmap; +		table = (unsigned long *) (*table & _REGION_ENTRY_ORIGIN); +		table = table + (((to + off) >> 20) & 0x7ff); + +		/* Store 'from' address in an invalid segment table entry. */ +		flush |= gmap_unlink_segment(gmap, table); +		*table =  (from + off) | (_SEGMENT_ENTRY_INVALID | +					  _SEGMENT_ENTRY_PROTECT); +	} +	spin_unlock(&gmap->mm->page_table_lock); +	up_read(&gmap->mm->mmap_sem); +	if (flush) +		gmap_flush_tlb(gmap); +	return 0; + +out_unmap: +	spin_unlock(&gmap->mm->page_table_lock); +	up_read(&gmap->mm->mmap_sem); +	gmap_unmap_segment(gmap, to, len); +	return -ENOMEM; +} +EXPORT_SYMBOL_GPL(gmap_map_segment); + +static unsigned long *gmap_table_walk(unsigned long address, struct gmap *gmap) +{ +	unsigned long *table; + +	table = gmap->table + ((address >> 53) & 0x7ff); +	if (unlikely(*table & _REGION_ENTRY_INVALID)) +		return ERR_PTR(-EFAULT); +	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); +	table = table + ((address >> 42) & 0x7ff); +	if (unlikely(*table & _REGION_ENTRY_INVALID)) +		return ERR_PTR(-EFAULT); +	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); +	table = table + ((address >> 31) & 0x7ff); +	if (unlikely(*table & _REGION_ENTRY_INVALID)) +		return ERR_PTR(-EFAULT); +	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); +	table = table + ((address >> 20) & 0x7ff); +	return table; +} + +/** + * __gmap_translate - translate a guest address to a user space address + * @address: guest address + * @gmap: pointer to guest mapping meta data structure + * + * Returns user space address which corresponds to the guest address or + * -EFAULT if no such mapping exists. + * This function does not establish potentially missing page table entries. + * The mmap_sem of the mm that belongs to the address space must be held + * when this function gets called. + */ +unsigned long __gmap_translate(unsigned long address, struct gmap *gmap) +{ +	unsigned long *segment_ptr, vmaddr, segment; +	struct gmap_pgtable *mp; +	struct page *page; + +	current->thread.gmap_addr = address; +	segment_ptr = gmap_table_walk(address, gmap); +	if (IS_ERR(segment_ptr)) +		return PTR_ERR(segment_ptr); +	/* Convert the gmap address to an mm address. 
*/ +	segment = *segment_ptr; +	if (!(segment & _SEGMENT_ENTRY_INVALID)) { +		page = pfn_to_page(segment >> PAGE_SHIFT); +		mp = (struct gmap_pgtable *) page->index; +		return mp->vmaddr | (address & ~PMD_MASK); +	} else if (segment & _SEGMENT_ENTRY_PROTECT) { +		vmaddr = segment & _SEGMENT_ENTRY_ORIGIN; +		return vmaddr | (address & ~PMD_MASK); +	} +	return -EFAULT; +} +EXPORT_SYMBOL_GPL(__gmap_translate); + +/** + * gmap_translate - translate a guest address to a user space address + * @address: guest address + * @gmap: pointer to guest mapping meta data structure + * + * Returns user space address which corresponds to the guest address or + * -EFAULT if no such mapping exists. + * This function does not establish potentially missing page table entries. + */ +unsigned long gmap_translate(unsigned long address, struct gmap *gmap) +{ +	unsigned long rc; + +	down_read(&gmap->mm->mmap_sem); +	rc = __gmap_translate(address, gmap); +	up_read(&gmap->mm->mmap_sem); +	return rc; +} +EXPORT_SYMBOL_GPL(gmap_translate); + +static int gmap_connect_pgtable(unsigned long address, unsigned long segment, +				unsigned long *segment_ptr, struct gmap *gmap) +{ +	unsigned long vmaddr; +	struct vm_area_struct *vma; +	struct gmap_pgtable *mp; +	struct gmap_rmap *rmap; +	struct mm_struct *mm; +	struct page *page; +	pgd_t *pgd; +	pud_t *pud; +	pmd_t *pmd; + +	mm = gmap->mm; +	vmaddr = segment & _SEGMENT_ENTRY_ORIGIN; +	vma = find_vma(mm, vmaddr); +	if (!vma || vma->vm_start > vmaddr) +		return -EFAULT; +	/* Walk the parent mm page table */ +	pgd = pgd_offset(mm, vmaddr); +	pud = pud_alloc(mm, pgd, vmaddr); +	if (!pud) +		return -ENOMEM; +	pmd = pmd_alloc(mm, pud, vmaddr); +	if (!pmd) +		return -ENOMEM; +	if (!pmd_present(*pmd) && +	    __pte_alloc(mm, vma, pmd, vmaddr)) +		return -ENOMEM; +	/* large pmds cannot yet be handled */ +	if (pmd_large(*pmd)) +		return -EFAULT; +	/* pmd now points to a valid segment table entry. */ +	rmap = kmalloc(sizeof(*rmap), GFP_KERNEL|__GFP_REPEAT); +	if (!rmap) +		return -ENOMEM; +	/* Link gmap segment table entry location to page table. */ +	page = pmd_page(*pmd); +	mp = (struct gmap_pgtable *) page->index; +	rmap->gmap = gmap; +	rmap->entry = segment_ptr; +	rmap->vmaddr = address & PMD_MASK; +	spin_lock(&mm->page_table_lock); +	if (*segment_ptr == segment) { +		list_add(&rmap->list, &mp->mapper); +		/* Set gmap segment table entry to page table. 
*/ +		*segment_ptr = pmd_val(*pmd) & PAGE_MASK; +		rmap = NULL; +	} +	spin_unlock(&mm->page_table_lock); +	kfree(rmap); +	return 0; +} + +static void gmap_disconnect_pgtable(struct mm_struct *mm, unsigned long *table) +{ +	struct gmap_rmap *rmap, *next; +	struct gmap_pgtable *mp; +	struct page *page; +	int flush; + +	flush = 0; +	spin_lock(&mm->page_table_lock); +	page = pfn_to_page(__pa(table) >> PAGE_SHIFT); +	mp = (struct gmap_pgtable *) page->index; +	list_for_each_entry_safe(rmap, next, &mp->mapper, list) { +		*rmap->entry = mp->vmaddr | (_SEGMENT_ENTRY_INVALID | +					     _SEGMENT_ENTRY_PROTECT); +		list_del(&rmap->list); +		kfree(rmap); +		flush = 1; +	} +	spin_unlock(&mm->page_table_lock); +	if (flush) +		__tlb_flush_global(); +} + +/* + * this function is assumed to be called with mmap_sem held + */ +unsigned long __gmap_fault(unsigned long address, struct gmap *gmap) +{ +	unsigned long *segment_ptr, segment; +	struct gmap_pgtable *mp; +	struct page *page; +	int rc; + +	current->thread.gmap_addr = address; +	segment_ptr = gmap_table_walk(address, gmap); +	if (IS_ERR(segment_ptr)) +		return -EFAULT; +	/* Convert the gmap address to an mm address. */ +	while (1) { +		segment = *segment_ptr; +		if (!(segment & _SEGMENT_ENTRY_INVALID)) { +			/* Page table is present */ +			page = pfn_to_page(segment >> PAGE_SHIFT); +			mp = (struct gmap_pgtable *) page->index; +			return mp->vmaddr | (address & ~PMD_MASK); +		} +		if (!(segment & _SEGMENT_ENTRY_PROTECT)) +			/* Nothing mapped in the gmap address space. */ +			break; +		rc = gmap_connect_pgtable(address, segment, segment_ptr, gmap); +		if (rc) +			return rc; +	} +	return -EFAULT; +} + +unsigned long gmap_fault(unsigned long address, struct gmap *gmap) +{ +	unsigned long rc; + +	down_read(&gmap->mm->mmap_sem); +	rc = __gmap_fault(address, gmap); +	up_read(&gmap->mm->mmap_sem); + +	return rc; +} +EXPORT_SYMBOL_GPL(gmap_fault); + +static void gmap_zap_swap_entry(swp_entry_t entry, struct mm_struct *mm) +{ +	if (!non_swap_entry(entry)) +		dec_mm_counter(mm, MM_SWAPENTS); +	else if (is_migration_entry(entry)) { +		struct page *page = migration_entry_to_page(entry); + +		if (PageAnon(page)) +			dec_mm_counter(mm, MM_ANONPAGES); +		else +			dec_mm_counter(mm, MM_FILEPAGES); +	} +	free_swap_and_cache(entry); +} + +/** + * The mm->mmap_sem lock must be held + */ +static void gmap_zap_unused(struct mm_struct *mm, unsigned long address) +{ +	unsigned long ptev, pgstev; +	spinlock_t *ptl; +	pgste_t pgste; +	pte_t *ptep, pte; + +	ptep = get_locked_pte(mm, address, &ptl); +	if (unlikely(!ptep)) +		return; +	pte = *ptep; +	if (!pte_swap(pte)) +		goto out_pte; +	/* Zap unused and logically-zero pages */ +	pgste = pgste_get_lock(ptep); +	pgstev = pgste_val(pgste); +	ptev = pte_val(pte); +	if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) || +	    ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID))) { +		gmap_zap_swap_entry(pte_to_swp_entry(pte), mm); +		pte_clear(mm, address, ptep); +	} +	pgste_set_unlock(ptep, pgste); +out_pte: +	pte_unmap_unlock(*ptep, ptl); +} + +/* + * this function is assumed to be called with mmap_sem held + */ +void __gmap_zap(unsigned long address, struct gmap *gmap) +{ +	unsigned long *table, *segment_ptr; +	unsigned long segment, pgstev, ptev; +	struct gmap_pgtable *mp; +	struct page *page; + +	segment_ptr = gmap_table_walk(address, gmap); +	if (IS_ERR(segment_ptr)) +		return; +	segment = *segment_ptr; +	if (segment & _SEGMENT_ENTRY_INVALID) +		return; +	page = pfn_to_page(segment >> PAGE_SHIFT); +	mp = 
(struct gmap_pgtable *) page->index; +	address = mp->vmaddr | (address & ~PMD_MASK); +	/* Page table is present */ +	table = (unsigned long *)(segment & _SEGMENT_ENTRY_ORIGIN); +	table = table + ((address >> 12) & 0xff); +	pgstev = table[PTRS_PER_PTE]; +	ptev = table[0]; +	/* quick check, checked again with locks held */ +	if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) || +	    ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID))) +		gmap_zap_unused(gmap->mm, address); +} +EXPORT_SYMBOL_GPL(__gmap_zap); + +void gmap_discard(unsigned long from, unsigned long to, struct gmap *gmap) +{ + +	unsigned long *table, address, size; +	struct vm_area_struct *vma; +	struct gmap_pgtable *mp; +	struct page *page; + +	down_read(&gmap->mm->mmap_sem); +	address = from; +	while (address < to) { +		/* Walk the gmap address space page table */ +		table = gmap->table + ((address >> 53) & 0x7ff); +		if (unlikely(*table & _REGION_ENTRY_INVALID)) { +			address = (address + PMD_SIZE) & PMD_MASK; +			continue; +		} +		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); +		table = table + ((address >> 42) & 0x7ff); +		if (unlikely(*table & _REGION_ENTRY_INVALID)) { +			address = (address + PMD_SIZE) & PMD_MASK; +			continue; +		} +		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); +		table = table + ((address >> 31) & 0x7ff); +		if (unlikely(*table & _REGION_ENTRY_INVALID)) { +			address = (address + PMD_SIZE) & PMD_MASK; +			continue; +		} +		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); +		table = table + ((address >> 20) & 0x7ff); +		if (unlikely(*table & _SEGMENT_ENTRY_INVALID)) { +			address = (address + PMD_SIZE) & PMD_MASK; +			continue; +		} +		page = pfn_to_page(*table >> PAGE_SHIFT); +		mp = (struct gmap_pgtable *) page->index; +		vma = find_vma(gmap->mm, mp->vmaddr); +		size = min(to - address, PMD_SIZE - (address & ~PMD_MASK)); +		zap_page_range(vma, mp->vmaddr | (address & ~PMD_MASK), +			       size, NULL); +		address = (address + PMD_SIZE) & PMD_MASK; +	} +	up_read(&gmap->mm->mmap_sem); +} +EXPORT_SYMBOL_GPL(gmap_discard); + +static LIST_HEAD(gmap_notifier_list); +static DEFINE_SPINLOCK(gmap_notifier_lock); + +/** + * gmap_register_ipte_notifier - register a pte invalidation callback + * @nb: pointer to the gmap notifier block + */ +void gmap_register_ipte_notifier(struct gmap_notifier *nb) +{ +	spin_lock(&gmap_notifier_lock); +	list_add(&nb->list, &gmap_notifier_list); +	spin_unlock(&gmap_notifier_lock); +} +EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier); + +/** + * gmap_unregister_ipte_notifier - remove a pte invalidation callback + * @nb: pointer to the gmap notifier block + */ +void gmap_unregister_ipte_notifier(struct gmap_notifier *nb) +{ +	spin_lock(&gmap_notifier_lock); +	list_del_init(&nb->list); +	spin_unlock(&gmap_notifier_lock); +} +EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier); + +/** + * gmap_ipte_notify - mark a range of ptes for invalidation notification + * @gmap: pointer to guest mapping meta data structure + * @start: virtual address in the guest address space + * @len: size of area + * + * Returns 0 if for each page in the given range a gmap mapping exists and + * the invalidation notification could be set. If the gmap mapping is missing + * for one or more pages -EFAULT is returned. If no memory could be allocated + * -ENOMEM is returned. This function establishes missing page table entries. 
+ */ +int gmap_ipte_notify(struct gmap *gmap, unsigned long start, unsigned long len) +{ +	unsigned long addr; +	spinlock_t *ptl; +	pte_t *ptep, entry; +	pgste_t pgste; +	int rc = 0; + +	if ((start & ~PAGE_MASK) || (len & ~PAGE_MASK)) +		return -EINVAL; +	down_read(&gmap->mm->mmap_sem); +	while (len) { +		/* Convert gmap address and connect the page tables */ +		addr = __gmap_fault(start, gmap); +		if (IS_ERR_VALUE(addr)) { +			rc = addr; +			break; +		} +		/* Get the page mapped */ +		if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) { +			rc = -EFAULT; +			break; +		} +		/* Walk the process page table, lock and get pte pointer */ +		ptep = get_locked_pte(gmap->mm, addr, &ptl); +		if (unlikely(!ptep)) +			continue; +		/* Set notification bit in the pgste of the pte */ +		entry = *ptep; +		if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) { +			pgste = pgste_get_lock(ptep); +			pgste_val(pgste) |= PGSTE_IN_BIT; +			pgste_set_unlock(ptep, pgste); +			start += PAGE_SIZE; +			len -= PAGE_SIZE; +		} +		spin_unlock(ptl); +	} +	up_read(&gmap->mm->mmap_sem); +	return rc; +} +EXPORT_SYMBOL_GPL(gmap_ipte_notify); + +/** + * gmap_do_ipte_notify - call all invalidation callbacks for a specific pte. + * @mm: pointer to the process mm_struct + * @pte: pointer to the page table entry + * + * This function is assumed to be called with the page table lock held + * for the pte to notify.   */ -unsigned long *page_table_alloc(struct mm_struct *mm) +void gmap_do_ipte_notify(struct mm_struct *mm, pte_t *pte) +{ +	unsigned long segment_offset; +	struct gmap_notifier *nb; +	struct gmap_pgtable *mp; +	struct gmap_rmap *rmap; +	struct page *page; + +	segment_offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); +	segment_offset = segment_offset * (4096 / sizeof(pte_t)); +	page = pfn_to_page(__pa(pte) >> PAGE_SHIFT); +	mp = (struct gmap_pgtable *) page->index; +	spin_lock(&gmap_notifier_lock); +	list_for_each_entry(rmap, &mp->mapper, list) { +		list_for_each_entry(nb, &gmap_notifier_list, list) +			nb->notifier_call(rmap->gmap, +					  rmap->vmaddr + segment_offset); +	} +	spin_unlock(&gmap_notifier_lock); +} +EXPORT_SYMBOL_GPL(gmap_do_ipte_notify); + +static inline int page_table_with_pgste(struct page *page) +{ +	return atomic_read(&page->_mapcount) == 0; +} + +static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm, +						    unsigned long vmaddr)  {  	struct page *page;  	unsigned long *table; -	unsigned long bits; +	struct gmap_pgtable *mp; -	bits = (mm->context.noexec || mm->context.has_pgste) ? 
3UL : 1UL; +	page = alloc_page(GFP_KERNEL|__GFP_REPEAT); +	if (!page) +		return NULL; +	mp = kmalloc(sizeof(*mp), GFP_KERNEL|__GFP_REPEAT); +	if (!mp) { +		__free_page(page); +		return NULL; +	} +	if (!pgtable_page_ctor(page)) { +		kfree(mp); +		__free_page(page); +		return NULL; +	} +	mp->vmaddr = vmaddr & PMD_MASK; +	INIT_LIST_HEAD(&mp->mapper); +	page->index = (unsigned long) mp; +	atomic_set(&page->_mapcount, 0); +	table = (unsigned long *) page_to_phys(page); +	clear_table(table, _PAGE_INVALID, PAGE_SIZE/2); +	clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2); +	return table; +} + +static inline void page_table_free_pgste(unsigned long *table) +{ +	struct page *page; +	struct gmap_pgtable *mp; + +	page = pfn_to_page(__pa(table) >> PAGE_SHIFT); +	mp = (struct gmap_pgtable *) page->index; +	BUG_ON(!list_empty(&mp->mapper)); +	pgtable_page_dtor(page); +	atomic_set(&page->_mapcount, -1); +	kfree(mp); +	__free_page(page); +} + +static inline unsigned long page_table_reset_pte(struct mm_struct *mm, pmd_t *pmd, +			unsigned long addr, unsigned long end, bool init_skey) +{ +	pte_t *start_pte, *pte; +	spinlock_t *ptl; +	pgste_t pgste; + +	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); +	pte = start_pte; +	do { +		pgste = pgste_get_lock(pte); +		pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK; +		if (init_skey) { +			unsigned long address; + +			pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT | +					      PGSTE_GR_BIT | PGSTE_GC_BIT); + +			/* skip invalid and not writable pages */ +			if (pte_val(*pte) & _PAGE_INVALID || +			    !(pte_val(*pte) & _PAGE_WRITE)) { +				pgste_set_unlock(pte, pgste); +				continue; +			} + +			address = pte_val(*pte) & PAGE_MASK; +			page_set_storage_key(address, PAGE_DEFAULT_KEY, 1); +		} +		pgste_set_unlock(pte, pgste); +	} while (pte++, addr += PAGE_SIZE, addr != end); +	pte_unmap_unlock(start_pte, ptl); + +	return addr; +} + +static inline unsigned long page_table_reset_pmd(struct mm_struct *mm, pud_t *pud, +			unsigned long addr, unsigned long end, bool init_skey) +{ +	unsigned long next; +	pmd_t *pmd; + +	pmd = pmd_offset(pud, addr); +	do { +		next = pmd_addr_end(addr, end); +		if (pmd_none_or_clear_bad(pmd)) +			continue; +		next = page_table_reset_pte(mm, pmd, addr, next, init_skey); +	} while (pmd++, addr = next, addr != end); + +	return addr; +} + +static inline unsigned long page_table_reset_pud(struct mm_struct *mm, pgd_t *pgd, +			unsigned long addr, unsigned long end, bool init_skey) +{ +	unsigned long next; +	pud_t *pud; + +	pud = pud_offset(pgd, addr); +	do { +		next = pud_addr_end(addr, end); +		if (pud_none_or_clear_bad(pud)) +			continue; +		next = page_table_reset_pmd(mm, pud, addr, next, init_skey); +	} while (pud++, addr = next, addr != end); + +	return addr; +} + +void page_table_reset_pgste(struct mm_struct *mm, unsigned long start, +			    unsigned long end, bool init_skey) +{ +	unsigned long addr, next; +	pgd_t *pgd; + +	down_write(&mm->mmap_sem); +	if (init_skey && mm_use_skey(mm)) +		goto out_up; +	addr = start; +	pgd = pgd_offset(mm, addr); +	do { +		next = pgd_addr_end(addr, end); +		if (pgd_none_or_clear_bad(pgd)) +			continue; +		next = page_table_reset_pud(mm, pgd, addr, next, init_skey); +	} while (pgd++, addr = next, addr != end); +	if (init_skey) +		current->mm->context.use_skey = 1; +out_up: +	up_write(&mm->mmap_sem); +} +EXPORT_SYMBOL(page_table_reset_pgste); + +int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, +			  unsigned long key, bool nq) +{ +	spinlock_t *ptl; +	pgste_t old, new; +	pte_t *ptep; + 
+	down_read(&mm->mmap_sem); +	ptep = get_locked_pte(current->mm, addr, &ptl); +	if (unlikely(!ptep)) { +		up_read(&mm->mmap_sem); +		return -EFAULT; +	} + +	new = old = pgste_get_lock(ptep); +	pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT | +			    PGSTE_ACC_BITS | PGSTE_FP_BIT); +	pgste_val(new) |= (key & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48; +	pgste_val(new) |= (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56; +	if (!(pte_val(*ptep) & _PAGE_INVALID)) { +		unsigned long address, bits, skey; + +		address = pte_val(*ptep) & PAGE_MASK; +		skey = (unsigned long) page_get_storage_key(address); +		bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); +		skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT); +		/* Set storage key ACC and FP */ +		page_set_storage_key(address, skey, !nq); +		/* Merge host changed & referenced into pgste  */ +		pgste_val(new) |= bits << 52; +	} +	/* changing the guest storage key is considered a change of the page */ +	if ((pgste_val(new) ^ pgste_val(old)) & +	    (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT)) +		pgste_val(new) |= PGSTE_UC_BIT; + +	pgste_set_unlock(ptep, new); +	pte_unmap_unlock(*ptep, ptl); +	up_read(&mm->mmap_sem); +	return 0; +} +EXPORT_SYMBOL(set_guest_storage_key); + +#else /* CONFIG_PGSTE */ + +static inline int page_table_with_pgste(struct page *page) +{ +	return 0; +} + +static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm, +						    unsigned long vmaddr) +{ +	return NULL; +} + +void page_table_reset_pgste(struct mm_struct *mm, unsigned long start, +			    unsigned long end, bool init_skey) +{ +} + +static inline void page_table_free_pgste(unsigned long *table) +{ +} + +static inline void gmap_disconnect_pgtable(struct mm_struct *mm, +					   unsigned long *table) +{ +} + +#endif /* CONFIG_PGSTE */ + +static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) +{ +	unsigned int old, new; + +	do { +		old = atomic_read(v); +		new = old ^ bits; +	} while (atomic_cmpxchg(v, old, new) != old); +	return new; +} + +/* + * page table entry allocation/free routines. 
+ */ +unsigned long *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr) +{ +	unsigned long *uninitialized_var(table); +	struct page *uninitialized_var(page); +	unsigned int mask, bit; + +	if (mm_has_pgste(mm)) +		return page_table_alloc_pgste(mm, vmaddr); +	/* Allocate fragments of a 4K page as 1K/2K page table */  	spin_lock_bh(&mm->context.list_lock); -	page = NULL; +	mask = FRAG_MASK;  	if (!list_empty(&mm->context.pgtable_list)) {  		page = list_first_entry(&mm->context.pgtable_list,  					struct page, lru); -		if ((page->flags & FRAG_MASK) == ((1UL << TABLES_PER_PAGE) - 1)) -			page = NULL; +		table = (unsigned long *) page_to_phys(page); +		mask = atomic_read(&page->_mapcount); +		mask = mask | (mask >> 4);  	} -	if (!page) { +	if ((mask & FRAG_MASK) == FRAG_MASK) {  		spin_unlock_bh(&mm->context.list_lock);  		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);  		if (!page)  			return NULL; -		pgtable_page_ctor(page); -		page->flags &= ~FRAG_MASK; +		if (!pgtable_page_ctor(page)) { +			__free_page(page); +			return NULL; +		} +		atomic_set(&page->_mapcount, 1);  		table = (unsigned long *) page_to_phys(page); -		if (mm->context.has_pgste) -			clear_table_pgstes(table); -		else -			clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE); +		clear_table(table, _PAGE_INVALID, PAGE_SIZE);  		spin_lock_bh(&mm->context.list_lock);  		list_add(&page->lru, &mm->context.pgtable_list); +	} else { +		for (bit = 1; mask & bit; bit <<= 1) +			table += PTRS_PER_PTE; +		mask = atomic_xor_bits(&page->_mapcount, bit); +		if ((mask & FRAG_MASK) == FRAG_MASK) +			list_del(&page->lru);  	} -	table = (unsigned long *) page_to_phys(page); -	while (page->flags & bits) { -		table += 256; -		bits <<= 1; -	} -	page->flags |= bits; -	if ((page->flags & FRAG_MASK) == ((1UL << TABLES_PER_PAGE) - 1)) -		list_move_tail(&page->lru, &mm->context.pgtable_list);  	spin_unlock_bh(&mm->context.list_lock);  	return table;  } -static void __page_table_free(struct mm_struct *mm, unsigned long *table) +void page_table_free(struct mm_struct *mm, unsigned long *table)  {  	struct page *page; -	unsigned long bits; +	unsigned int bit, mask; -	bits = ((unsigned long) table) & 15; -	table = (unsigned long *)(((unsigned long) table) ^ bits);  	page = pfn_to_page(__pa(table) >> PAGE_SHIFT); -	page->flags ^= bits; -	if (!(page->flags & FRAG_MASK)) { +	if (page_table_with_pgste(page)) { +		gmap_disconnect_pgtable(mm, table); +		return page_table_free_pgste(table); +	} +	/* Free 1K/2K page table fragment of a 4K page */ +	bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t))); +	spin_lock_bh(&mm->context.list_lock); +	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK) +		list_del(&page->lru); +	mask = atomic_xor_bits(&page->_mapcount, bit); +	if (mask & FRAG_MASK) +		list_add(&page->lru, &mm->context.pgtable_list); +	spin_unlock_bh(&mm->context.list_lock); +	if (mask == 0) {  		pgtable_page_dtor(page); +		atomic_set(&page->_mapcount, -1);  		__free_page(page);  	}  } -void page_table_free(struct mm_struct *mm, unsigned long *table) +static void __page_table_free_rcu(void *table, unsigned bit)  {  	struct page *page; -	unsigned long bits; -	bits = (mm->context.noexec || mm->context.has_pgste) ? 
3UL : 1UL; -	bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long); +	if (bit == FRAG_MASK) +		return page_table_free_pgste(table); +	/* Free 1K/2K page table fragment of a 4K page */  	page = pfn_to_page(__pa(table) >> PAGE_SHIFT); -	spin_lock_bh(&mm->context.list_lock); -	page->flags ^= bits; -	if (page->flags & FRAG_MASK) { -		/* Page now has some free pgtable fragments. */ -		list_move(&page->lru, &mm->context.pgtable_list); -		page = NULL; -	} else -		/* All fragments of the 4K page have been freed. */ -		list_del(&page->lru); -	spin_unlock_bh(&mm->context.list_lock); -	if (page) { +	if (atomic_xor_bits(&page->_mapcount, bit) == 0) {  		pgtable_page_dtor(page); +		atomic_set(&page->_mapcount, -1);  		__free_page(page);  	}  } -void page_table_free_rcu(struct mm_struct *mm, unsigned long *table) +void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)  { -	struct rcu_table_freelist *batch; +	struct mm_struct *mm;  	struct page *page; -	unsigned long bits; +	unsigned int bit, mask; -	if (atomic_read(&mm->mm_users) < 2 && -	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) { -		page_table_free(mm, table); -		return; -	} -	batch = rcu_table_freelist_get(mm); -	if (!batch) { -		smp_call_function(smp_sync, NULL, 1); -		page_table_free(mm, table); +	mm = tlb->mm; +	page = pfn_to_page(__pa(table) >> PAGE_SHIFT); +	if (page_table_with_pgste(page)) { +		gmap_disconnect_pgtable(mm, table); +		table = (unsigned long *) (__pa(table) | FRAG_MASK); +		tlb_remove_table(tlb, table);  		return;  	} -	bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL; -	bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long); -	page = pfn_to_page(__pa(table) >> PAGE_SHIFT); +	bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));  	spin_lock_bh(&mm->context.list_lock); -	/* Delayed freeing with rcu prevents reuse of pgtable fragments */ -	list_del_init(&page->lru); +	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK) +		list_del(&page->lru); +	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4)); +	if (mask & FRAG_MASK) +		list_add_tail(&page->lru, &mm->context.pgtable_list);  	spin_unlock_bh(&mm->context.list_lock); -	table = (unsigned long *)(((unsigned long) table) | bits); -	batch->table[batch->pgt_index++] = table; -	if (batch->pgt_index >= batch->crst_index) -		rcu_table_freelist_finish(); +	table = (unsigned long *) (__pa(table) | (bit << 4)); +	tlb_remove_table(tlb, table); +} + +static void __tlb_remove_table(void *_table) +{ +	const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK; +	void *table = (void *)((unsigned long) _table & ~mask); +	unsigned type = (unsigned long) _table & mask; + +	if (type) +		__page_table_free_rcu(table, type); +	else +		free_pages((unsigned long) table, ALLOC_ORDER); +} + +static void tlb_remove_table_smp_sync(void *arg) +{ +	/* Simply deliver the interrupt */ +} + +static void tlb_remove_table_one(void *table) +{ +	/* +	 * This isn't an RCU grace period and hence the page-tables cannot be +	 * assumed to be actually RCU-freed. +	 * +	 * It is however sufficient for software page-table walkers that rely +	 * on IRQ disabling. See the comment near struct mmu_table_batch. 
+	 */ +	smp_call_function(tlb_remove_table_smp_sync, NULL, 1); +	__tlb_remove_table(table); +} + +static void tlb_remove_table_rcu(struct rcu_head *head) +{ +	struct mmu_table_batch *batch; +	int i; + +	batch = container_of(head, struct mmu_table_batch, rcu); + +	for (i = 0; i < batch->nr; i++) +		__tlb_remove_table(batch->tables[i]); + +	free_page((unsigned long)batch);  } -void disable_noexec(struct mm_struct *mm, struct task_struct *tsk) +void tlb_table_flush(struct mmu_gather *tlb)  { +	struct mmu_table_batch **batch = &tlb->batch; + +	if (*batch) { +		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu); +		*batch = NULL; +	} +} + +void tlb_remove_table(struct mmu_gather *tlb, void *table) +{ +	struct mmu_table_batch **batch = &tlb->batch; + +	tlb->mm->context.flush_mm = 1; +	if (*batch == NULL) { +		*batch = (struct mmu_table_batch *) +			__get_free_page(GFP_NOWAIT | __GFP_NOWARN); +		if (*batch == NULL) { +			__tlb_flush_mm_lazy(tlb->mm); +			tlb_remove_table_one(table); +			return; +		} +		(*batch)->nr = 0; +	} +	(*batch)->tables[(*batch)->nr++] = table; +	if ((*batch)->nr == MAX_TABLE_BATCH) +		tlb_flush_mmu(tlb); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline void thp_split_vma(struct vm_area_struct *vma) +{ +	unsigned long addr; + +	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) +		follow_page(vma, addr, FOLL_SPLIT); +} + +static inline void thp_split_mm(struct mm_struct *mm) +{ +	struct vm_area_struct *vma; + +	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) { +		thp_split_vma(vma); +		vma->vm_flags &= ~VM_HUGEPAGE; +		vma->vm_flags |= VM_NOHUGEPAGE; +	} +	mm->def_flags |= VM_NOHUGEPAGE; +} +#else +static inline void thp_split_mm(struct mm_struct *mm) +{ +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +static unsigned long page_table_realloc_pmd(struct mmu_gather *tlb, +				struct mm_struct *mm, pud_t *pud, +				unsigned long addr, unsigned long end) +{ +	unsigned long next, *table, *new;  	struct page *page; +	pmd_t *pmd; -	spin_lock_bh(&mm->context.list_lock); -	/* Free shadow region and segment tables. */ -	list_for_each_entry(page, &mm->context.crst_list, lru) -		if (page->index) { -			free_pages((unsigned long) page->index, ALLOC_ORDER); -			page->index = 0; +	pmd = pmd_offset(pud, addr); +	do { +		next = pmd_addr_end(addr, end); +again: +		if (pmd_none_or_clear_bad(pmd)) +			continue; +		table = (unsigned long *) pmd_deref(*pmd); +		page = pfn_to_page(__pa(table) >> PAGE_SHIFT); +		if (page_table_with_pgste(page)) +			continue; +		/* Allocate new page table with pgstes */ +		new = page_table_alloc_pgste(mm, addr); +		if (!new) +			return -ENOMEM; + +		spin_lock(&mm->page_table_lock); +		if (likely((unsigned long *) pmd_deref(*pmd) == table)) { +			/* Nuke pmd entry pointing to the "short" page table */ +			pmdp_flush_lazy(mm, addr, pmd); +			pmd_clear(pmd); +			/* Copy ptes from old table to new table */ +			memcpy(new, table, PAGE_SIZE/2); +			clear_table(table, _PAGE_INVALID, PAGE_SIZE/2); +			/* Establish new table */ +			pmd_populate(mm, pmd, (pte_t *) new); +			/* Free old table with rcu, there might be a walker! */ +			page_table_free_rcu(tlb, table); +			new = NULL;  		} -	/* "Free" second halves of page tables. 
*/ -	list_for_each_entry(page, &mm->context.pgtable_list, lru) -		page->flags &= ~SECOND_HALVES; -	spin_unlock_bh(&mm->context.list_lock); -	mm->context.noexec = 0; -	update_mm(mm, tsk); +		spin_unlock(&mm->page_table_lock); +		if (new) { +			page_table_free_pgste(new); +			goto again; +		} +	} while (pmd++, addr = next, addr != end); + +	return addr; +} + +static unsigned long page_table_realloc_pud(struct mmu_gather *tlb, +				   struct mm_struct *mm, pgd_t *pgd, +				   unsigned long addr, unsigned long end) +{ +	unsigned long next; +	pud_t *pud; + +	pud = pud_offset(pgd, addr); +	do { +		next = pud_addr_end(addr, end); +		if (pud_none_or_clear_bad(pud)) +			continue; +		next = page_table_realloc_pmd(tlb, mm, pud, addr, next); +		if (unlikely(IS_ERR_VALUE(next))) +			return next; +	} while (pud++, addr = next, addr != end); + +	return addr; +} + +static unsigned long page_table_realloc(struct mmu_gather *tlb, struct mm_struct *mm, +					unsigned long addr, unsigned long end) +{ +	unsigned long next; +	pgd_t *pgd; + +	pgd = pgd_offset(mm, addr); +	do { +		next = pgd_addr_end(addr, end); +		if (pgd_none_or_clear_bad(pgd)) +			continue; +		next = page_table_realloc_pud(tlb, mm, pgd, addr, next); +		if (unlikely(IS_ERR_VALUE(next))) +			return next; +	} while (pgd++, addr = next, addr != end); + +	return 0;  }  /* @@ -403,74 +1365,132 @@ void disable_noexec(struct mm_struct *mm, struct task_struct *tsk)  int s390_enable_sie(void)  {  	struct task_struct *tsk = current; -	struct mm_struct *mm, *old_mm; - -	/* Do we have switched amode? If no, we cannot do sie */ -	if (user_mode == HOME_SPACE_MODE) -		return -EINVAL; +	struct mm_struct *mm = tsk->mm; +	struct mmu_gather tlb;  	/* Do we have pgstes? if yes, we are done */ -	if (tsk->mm->context.has_pgste) +	if (mm_has_pgste(tsk->mm))  		return 0; -	/* lets check if we are allowed to replace the mm */ -	task_lock(tsk); -	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || -#ifdef CONFIG_AIO -	    !hlist_empty(&tsk->mm->ioctx_list) || -#endif -	    tsk->mm != tsk->active_mm) { -		task_unlock(tsk); -		return -EINVAL; -	} -	task_unlock(tsk); +	down_write(&mm->mmap_sem); +	/* split thp mappings and disable thp for future mappings */ +	thp_split_mm(mm); +	/* Reallocate the page tables with pgstes */ +	tlb_gather_mmu(&tlb, mm, 0, TASK_SIZE); +	if (!page_table_realloc(&tlb, mm, 0, TASK_SIZE)) +		mm->context.has_pgste = 1; +	tlb_finish_mmu(&tlb, 0, TASK_SIZE); +	up_write(&mm->mmap_sem); +	return mm->context.has_pgste ? 0 : -ENOMEM; +} +EXPORT_SYMBOL_GPL(s390_enable_sie); -	/* we copy the mm and let dup_mm create the page tables with_pgstes */ -	tsk->mm->context.alloc_pgste = 1; -	mm = dup_mm(tsk); -	tsk->mm->context.alloc_pgste = 0; -	if (!mm) -		return -ENOMEM; +/* + * Enable storage key handling from now on and initialize the storage + * keys with the default key. 
+ */ +void s390_enable_skey(void) +{ +	page_table_reset_pgste(current->mm, 0, TASK_SIZE, true); +} +EXPORT_SYMBOL_GPL(s390_enable_skey); -	/* Now lets check again if something happened */ -	task_lock(tsk); -	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || -#ifdef CONFIG_AIO -	    !hlist_empty(&tsk->mm->ioctx_list) || -#endif -	    tsk->mm != tsk->active_mm) { -		mmput(mm); -		task_unlock(tsk); -		return -EINVAL; +/* + * Test and reset if a guest page is dirty + */ +bool gmap_test_and_clear_dirty(unsigned long address, struct gmap *gmap) +{ +	pte_t *pte; +	spinlock_t *ptl; +	bool dirty = false; + +	pte = get_locked_pte(gmap->mm, address, &ptl); +	if (unlikely(!pte)) +		return false; + +	if (ptep_test_and_clear_user_dirty(gmap->mm, address, pte)) +		dirty = true; + +	spin_unlock(ptl); +	return dirty; +} +EXPORT_SYMBOL_GPL(gmap_test_and_clear_dirty); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, +			   pmd_t *pmdp) +{ +	VM_BUG_ON(address & ~HPAGE_PMD_MASK); +	/* No need to flush TLB +	 * On s390 reference bits are in storage key and never in TLB */ +	return pmdp_test_and_clear_young(vma, address, pmdp); +} + +int pmdp_set_access_flags(struct vm_area_struct *vma, +			  unsigned long address, pmd_t *pmdp, +			  pmd_t entry, int dirty) +{ +	VM_BUG_ON(address & ~HPAGE_PMD_MASK); + +	if (pmd_same(*pmdp, entry)) +		return 0; +	pmdp_invalidate(vma, address, pmdp); +	set_pmd_at(vma->vm_mm, address, pmdp, entry); +	return 1; +} + +static void pmdp_splitting_flush_sync(void *arg) +{ +	/* Simply deliver the interrupt */ +} + +void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, +			  pmd_t *pmdp) +{ +	VM_BUG_ON(address & ~HPAGE_PMD_MASK); +	if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT, +			      (unsigned long *) pmdp)) { +		/* need to serialize against gup-fast (IRQ disabled) */ +		smp_call_function(pmdp_splitting_flush_sync, NULL, 1);  	} +} -	/* ok, we are alone. No ptrace, no threads, etc. 
*/ -	old_mm = tsk->mm; -	tsk->mm = tsk->active_mm = mm; -	preempt_disable(); -	update_mm(mm, tsk); -	atomic_inc(&mm->context.attach_count); -	atomic_dec(&old_mm->context.attach_count); -	cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm)); -	preempt_enable(); -	task_unlock(tsk); -	mmput(old_mm); -	return 0; +void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, +				pgtable_t pgtable) +{ +	struct list_head *lh = (struct list_head *) pgtable; + +	assert_spin_locked(pmd_lockptr(mm, pmdp)); + +	/* FIFO */ +	if (!pmd_huge_pte(mm, pmdp)) +		INIT_LIST_HEAD(lh); +	else +		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp)); +	pmd_huge_pte(mm, pmdp) = pgtable;  } -EXPORT_SYMBOL_GPL(s390_enable_sie); -#if defined(CONFIG_DEBUG_PAGEALLOC) && defined(CONFIG_HIBERNATION) -bool kernel_page_present(struct page *page) +pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)  { -	unsigned long addr; -	int cc; +	struct list_head *lh; +	pgtable_t pgtable; +	pte_t *ptep; + +	assert_spin_locked(pmd_lockptr(mm, pmdp)); -	addr = page_to_phys(page); -	asm volatile( -		"	lra	%1,0(%1)\n" -		"	ipm	%0\n" -		"	srl	%0,28" -		: "=d" (cc), "+a" (addr) : : "cc"); -	return cc == 0; +	/* FIFO */ +	pgtable = pmd_huge_pte(mm, pmdp); +	lh = (struct list_head *) pgtable; +	if (list_empty(lh)) +		pmd_huge_pte(mm, pmdp) = NULL; +	else { +		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next; +		list_del(lh); +	} +	ptep = (pte_t *) pgtable; +	pte_val(*ptep) = _PAGE_INVALID; +	ptep++; +	pte_val(*ptep) = _PAGE_INVALID; +	return pgtable;  } -#endif /* CONFIG_HIBERNATION && CONFIG_DEBUG_PAGEALLOC */ +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */  | 
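
Note: the guest-address walks added in this patch (gmap_table_walk(), gmap_unmap_segment(), gmap_map_segment(), __gmap_zap()) all derive their table indices from the same shifts and masks. The following is a minimal user-space sketch of that index arithmetic only; the example address is made up and the program is illustrative, not kernel code.

/*
 * Minimal user-space sketch of the table-walk index arithmetic used by
 * gmap_table_walk() and __gmap_zap() in this patch.  The shifts and masks
 * mirror the region-1/region-2/region-3/segment/page indexing seen in the
 * diff; this is illustrative only and not part of the kernel sources.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t addr = 0x0000123456789abcULL;	/* hypothetical guest address */

	unsigned int r1 = (addr >> 53) & 0x7ff;	/* region-1 table index */
	unsigned int r2 = (addr >> 42) & 0x7ff;	/* region-2 table index */
	unsigned int r3 = (addr >> 31) & 0x7ff;	/* region-3 table index */
	unsigned int sx = (addr >> 20) & 0x7ff;	/* segment table index  */
	unsigned int px = (addr >> 12) & 0xff;	/* page table index     */

	printf("addr=%#018llx r1=%u r2=%u r3=%u sx=%u px=%u\n",
	       (unsigned long long) addr, r1, r2, r3, sx, px);
	return 0;
}

The region and segment levels use 11-bit indices (2048 entries per table), while the page-table fragments introduced by this patch hold only 256 pte entries in the lower half of a 2K fragment, with the matching pgste entries in the upper half (see the clear_table(table, _PAGE_INVALID, PAGE_SIZE/2) / clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2) pair in page_table_alloc_pgste()).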
