Diffstat (limited to 'fs/proc/task_mmu.c')
-rw-r--r--	fs/proc/task_mmu.c	1058
1 file changed, 883 insertions, 175 deletions
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index da6b01d70f0..cfa63ee92c9 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1,5 +1,7 @@  #include <linux/mm.h> +#include <linux/vmacache.h>  #include <linux/hugetlb.h> +#include <linux/huge_mm.h>  #include <linux/mount.h>  #include <linux/seq_file.h>  #include <linux/highmem.h> @@ -7,8 +9,10 @@  #include <linux/slab.h>  #include <linux/pagemap.h>  #include <linux/mempolicy.h> +#include <linux/rmap.h>  #include <linux/swap.h>  #include <linux/swapops.h> +#include <linux/mmu_notifier.h>  #include <asm/elf.h>  #include <asm/uaccess.h> @@ -42,6 +46,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)  		"VmPeak:\t%8lu kB\n"  		"VmSize:\t%8lu kB\n"  		"VmLck:\t%8lu kB\n" +		"VmPin:\t%8lu kB\n"  		"VmHWM:\t%8lu kB\n"  		"VmRSS:\t%8lu kB\n"  		"VmData:\t%8lu kB\n" @@ -51,13 +56,15 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)  		"VmPTE:\t%8lu kB\n"  		"VmSwap:\t%8lu kB\n",  		hiwater_vm << (PAGE_SHIFT-10), -		(total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), +		total_vm << (PAGE_SHIFT-10),  		mm->locked_vm << (PAGE_SHIFT-10), +		mm->pinned_vm << (PAGE_SHIFT-10),  		hiwater_rss << (PAGE_SHIFT-10),  		total_rss << (PAGE_SHIFT-10),  		data << (PAGE_SHIFT-10),  		mm->stack_vm << (PAGE_SHIFT-10), text, lib, -		(PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10, +		(PTRS_PER_PTE * sizeof(pte_t) * +		 atomic_long_read(&mm->nr_ptes)) >> 10,  		swap << (PAGE_SHIFT-10));  } @@ -66,8 +73,9 @@ unsigned long task_vsize(struct mm_struct *mm)  	return PAGE_SIZE * mm->total_vm;  } -int task_statm(struct mm_struct *mm, int *shared, int *text, -	       int *data, int *resident) +unsigned long task_statm(struct mm_struct *mm, +			 unsigned long *shared, unsigned long *text, +			 unsigned long *data, unsigned long *resident)  {  	*shared = get_mm_counter(mm, MM_FILEPAGES);  	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) @@ -77,18 +85,55 @@ int task_statm(struct mm_struct *mm, int *shared, int *text,  	return mm->total_vm;  } -static void pad_len_spaces(struct seq_file *m, int len) +#ifdef CONFIG_NUMA +/* + * These functions are for numa_maps but called in generic **maps seq_file + * ->start(), ->stop() ops. + * + * numa_maps scans all vmas under mmap_sem and checks their mempolicy. + * Each mempolicy object is controlled by reference counting. The problem here + * is how to avoid accessing dead mempolicy object. + * + * Because we're holding mmap_sem while reading seq_file, it's safe to access + * each vma's mempolicy, no vma objects will never drop refs to mempolicy. + * + * A task's mempolicy (task->mempolicy) has different behavior. task->mempolicy + * is set and replaced under mmap_sem but unrefed and cleared under task_lock(). + * So, without task_lock(), we cannot trust get_vma_policy() because we cannot + * gurantee the task never exits under us. But taking task_lock() around + * get_vma_plicy() causes lock order problem. + * + * To access task->mempolicy without lock, we hold a reference count of an + * object pointed by task->mempolicy and remember it. This will guarantee + * that task->mempolicy points to an alive object or NULL in numa_maps accesses. 
+ */ +static void hold_task_mempolicy(struct proc_maps_private *priv) +{ +	struct task_struct *task = priv->task; + +	task_lock(task); +	priv->task_mempolicy = task->mempolicy; +	mpol_get(priv->task_mempolicy); +	task_unlock(task); +} +static void release_task_mempolicy(struct proc_maps_private *priv)  { -	len = 25 + sizeof(void*) * 6 - len; -	if (len < 1) -		len = 1; -	seq_printf(m, "%*c", len, ' '); +	mpol_put(priv->task_mempolicy);  } +#else +static void hold_task_mempolicy(struct proc_maps_private *priv) +{ +} +static void release_task_mempolicy(struct proc_maps_private *priv) +{ +} +#endif  static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma)  {  	if (vma && vma != priv->tail_vma) {  		struct mm_struct *mm = vma->vm_mm; +		release_task_mempolicy(priv);  		up_read(&mm->mmap_sem);  		mmput(mm);  	} @@ -108,7 +153,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)  	/*  	 * We remember last_addr rather than next_addr to hit with -	 * mmap_cache most of the time. We have zero last_addr at +	 * vmacache most of the time. We have zero last_addr at  	 * the beginning and also after lseek. We will have -1 last_addr  	 * after the end of the vmas.  	 */ @@ -118,16 +163,16 @@ static void *m_start(struct seq_file *m, loff_t *pos)  	priv->task = get_pid_task(priv->pid, PIDTYPE_PID);  	if (!priv->task) -		return NULL; +		return ERR_PTR(-ESRCH); -	mm = mm_for_maps(priv->task); -	if (!mm) -		return NULL; +	mm = mm_access(priv->task, PTRACE_MODE_READ); +	if (!mm || IS_ERR(mm)) +		return mm;  	down_read(&mm->mmap_sem); -	tail_vma = get_gate_vma(priv->task); +	tail_vma = get_gate_vma(priv->task->mm);  	priv->tail_vma = tail_vma; - +	hold_task_mempolicy(priv);  	/* Start with last addr hint */  	vma = find_vma(mm, last_addr);  	if (last_addr && vma) { @@ -154,6 +199,7 @@ out:  	if (vma)  		return vma; +	release_task_mempolicy(priv);  	/* End of vmas has been reached */  	m->version = (tail_vma != NULL)? 
0: -1UL;  	up_read(&mm->mmap_sem); @@ -179,7 +225,8 @@ static void m_stop(struct seq_file *m, void *v)  	struct proc_maps_private *priv = m->private;  	struct vm_area_struct *vma = v; -	vma_stop(priv, vma); +	if (!IS_ERR(vma)) +		vma_stop(priv, vma);  	if (priv->task)  		put_task_struct(priv->task);  } @@ -203,19 +250,22 @@ static int do_maps_open(struct inode *inode, struct file *file,  	return ret;  } -static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) +static void +show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)  {  	struct mm_struct *mm = vma->vm_mm;  	struct file *file = vma->vm_file; -	int flags = vma->vm_flags; +	struct proc_maps_private *priv = m->private; +	struct task_struct *task = priv->task; +	vm_flags_t flags = vma->vm_flags;  	unsigned long ino = 0;  	unsigned long long pgoff = 0; -	unsigned long start; +	unsigned long start, end;  	dev_t dev = 0; -	int len; +	const char *name = NULL;  	if (file) { -		struct inode *inode = vma->vm_file->f_path.dentry->d_inode; +		struct inode *inode = file_inode(vma->vm_file);  		dev = inode->i_sb->s_dev;  		ino = inode->i_ino;  		pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; @@ -223,77 +273,137 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)  	/* We don't show the stack guard page in /proc/maps */  	start = vma->vm_start; -	if (vma->vm_flags & VM_GROWSDOWN) -		if (!vma_stack_continue(vma->vm_prev, vma->vm_start)) -			start += PAGE_SIZE; - -	seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", +	if (stack_guard_page_start(vma, start)) +		start += PAGE_SIZE; +	end = vma->vm_end; +	if (stack_guard_page_end(vma, end)) +		end -= PAGE_SIZE; + +	seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); +	seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",  			start, -			vma->vm_end, +			end,  			flags & VM_READ ? 'r' : '-',  			flags & VM_WRITE ? 'w' : '-',  			flags & VM_EXEC ? 'x' : '-',  			flags & VM_MAYSHARE ? 's' : 'p',  			pgoff, -			MAJOR(dev), MINOR(dev), ino, &len); +			MAJOR(dev), MINOR(dev), ino);  	/*  	 * Print the dentry name for named mappings, and a  	 * special [heap] marker for the heap:  	 */  	if (file) { -		pad_len_spaces(m, len); +		seq_pad(m, ' ');  		seq_path(m, &file->f_path, "\n"); -	} else { -		const char *name = arch_vma_name(vma); -		if (!name) { -			if (mm) { -				if (vma->vm_start <= mm->start_brk && -						vma->vm_end >= mm->brk) { -					name = "[heap]"; -				} else if (vma->vm_start <= mm->start_stack && -					   vma->vm_end >= mm->start_stack) { -					name = "[stack]"; -				} +		goto done; +	} + +	if (vma->vm_ops && vma->vm_ops->name) { +		name = vma->vm_ops->name(vma); +		if (name) +			goto done; +	} + +	name = arch_vma_name(vma); +	if (!name) { +		pid_t tid; + +		if (!mm) { +			name = "[vdso]"; +			goto done; +		} + +		if (vma->vm_start <= mm->brk && +		    vma->vm_end >= mm->start_brk) { +			name = "[heap]"; +			goto done; +		} + +		tid = vm_is_stack(task, vma, is_pid); + +		if (tid != 0) { +			/* +			 * Thread stack in /proc/PID/task/TID/maps or +			 * the main process stack. 
+			 */ +			if (!is_pid || (vma->vm_start <= mm->start_stack && +			    vma->vm_end >= mm->start_stack)) { +				name = "[stack]";  			} else { -				name = "[vdso]"; +				/* Thread stack in /proc/PID/maps */ +				seq_pad(m, ' '); +				seq_printf(m, "[stack:%d]", tid);  			}  		} -		if (name) { -			pad_len_spaces(m, len); -			seq_puts(m, name); -		} +	} + +done: +	if (name) { +		seq_pad(m, ' '); +		seq_puts(m, name);  	}  	seq_putc(m, '\n');  } -static int show_map(struct seq_file *m, void *v) +static int show_map(struct seq_file *m, void *v, int is_pid)  {  	struct vm_area_struct *vma = v;  	struct proc_maps_private *priv = m->private;  	struct task_struct *task = priv->task; -	show_map_vma(m, vma); +	show_map_vma(m, vma, is_pid);  	if (m->count < m->size)  /* vma is copied successfully */ -		m->version = (vma != get_gate_vma(task))? vma->vm_start: 0; +		m->version = (vma != get_gate_vma(task->mm)) +			? vma->vm_start : 0;  	return 0;  } +static int show_pid_map(struct seq_file *m, void *v) +{ +	return show_map(m, v, 1); +} + +static int show_tid_map(struct seq_file *m, void *v) +{ +	return show_map(m, v, 0); +} +  static const struct seq_operations proc_pid_maps_op = {  	.start	= m_start,  	.next	= m_next,  	.stop	= m_stop, -	.show	= show_map +	.show	= show_pid_map  }; -static int maps_open(struct inode *inode, struct file *file) +static const struct seq_operations proc_tid_maps_op = { +	.start	= m_start, +	.next	= m_next, +	.stop	= m_stop, +	.show	= show_tid_map +}; + +static int pid_maps_open(struct inode *inode, struct file *file)  {  	return do_maps_open(inode, file, &proc_pid_maps_op);  } -const struct file_operations proc_maps_operations = { -	.open		= maps_open, +static int tid_maps_open(struct inode *inode, struct file *file) +{ +	return do_maps_open(inode, file, &proc_tid_maps_op); +} + +const struct file_operations proc_pid_maps_operations = { +	.open		= pid_maps_open, +	.read		= seq_read, +	.llseek		= seq_lseek, +	.release	= seq_release_private, +}; + +const struct file_operations proc_tid_maps_operations = { +	.open		= tid_maps_open,  	.read		= seq_read,  	.llseek		= seq_lseek,  	.release	= seq_release_private, @@ -328,64 +438,150 @@ struct mem_size_stats {  	unsigned long private_dirty;  	unsigned long referenced;  	unsigned long anonymous; +	unsigned long anonymous_thp;  	unsigned long swap; +	unsigned long nonlinear;  	u64 pss;  }; -static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, -			   struct mm_walk *walk) + +static void smaps_pte_entry(pte_t ptent, unsigned long addr, +		unsigned long ptent_size, struct mm_walk *walk)  {  	struct mem_size_stats *mss = walk->private;  	struct vm_area_struct *vma = mss->vma; -	pte_t *pte, ptent; -	spinlock_t *ptl; -	struct page *page; +	pgoff_t pgoff = linear_page_index(vma, addr); +	struct page *page = NULL;  	int mapcount; -	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); -	for (; addr != end; pte++, addr += PAGE_SIZE) { -		ptent = *pte; - -		if (is_swap_pte(ptent)) { -			mss->swap += PAGE_SIZE; -			continue; -		} +	if (pte_present(ptent)) { +		page = vm_normal_page(vma, addr, ptent); +	} else if (is_swap_pte(ptent)) { +		swp_entry_t swpent = pte_to_swp_entry(ptent); + +		if (!non_swap_entry(swpent)) +			mss->swap += ptent_size; +		else if (is_migration_entry(swpent)) +			page = migration_entry_to_page(swpent); +	} else if (pte_file(ptent)) { +		if (pte_to_pgoff(ptent) != pgoff) +			mss->nonlinear += ptent_size; +	} -		if (!pte_present(ptent)) -			continue; +	if (!page) +		return; + +	if 
(PageAnon(page)) +		mss->anonymous += ptent_size; + +	if (page->index != pgoff) +		mss->nonlinear += ptent_size; + +	mss->resident += ptent_size; +	/* Accumulate the size in pages that have been accessed. */ +	if (pte_young(ptent) || PageReferenced(page)) +		mss->referenced += ptent_size; +	mapcount = page_mapcount(page); +	if (mapcount >= 2) { +		if (pte_dirty(ptent) || PageDirty(page)) +			mss->shared_dirty += ptent_size; +		else +			mss->shared_clean += ptent_size; +		mss->pss += (ptent_size << PSS_SHIFT) / mapcount; +	} else { +		if (pte_dirty(ptent) || PageDirty(page)) +			mss->private_dirty += ptent_size; +		else +			mss->private_clean += ptent_size; +		mss->pss += (ptent_size << PSS_SHIFT); +	} +} -		page = vm_normal_page(vma, addr, ptent); -		if (!page) -			continue; +static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, +			   struct mm_walk *walk) +{ +	struct mem_size_stats *mss = walk->private; +	struct vm_area_struct *vma = mss->vma; +	pte_t *pte; +	spinlock_t *ptl; -		if (PageAnon(page)) -			mss->anonymous += PAGE_SIZE; - -		mss->resident += PAGE_SIZE; -		/* Accumulate the size in pages that have been accessed. */ -		if (pte_young(ptent) || PageReferenced(page)) -			mss->referenced += PAGE_SIZE; -		mapcount = page_mapcount(page); -		if (mapcount >= 2) { -			if (pte_dirty(ptent) || PageDirty(page)) -				mss->shared_dirty += PAGE_SIZE; -			else -				mss->shared_clean += PAGE_SIZE; -			mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount; -		} else { -			if (pte_dirty(ptent) || PageDirty(page)) -				mss->private_dirty += PAGE_SIZE; -			else -				mss->private_clean += PAGE_SIZE; -			mss->pss += (PAGE_SIZE << PSS_SHIFT); -		} +	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { +		smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk); +		spin_unlock(ptl); +		mss->anonymous_thp += HPAGE_PMD_SIZE; +		return 0;  	} + +	if (pmd_trans_unstable(pmd)) +		return 0; +	/* +	 * The mmap_sem held all the way back in m_start() is what +	 * keeps khugepaged out of here and from collapsing things +	 * in here. +	 */ +	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); +	for (; addr != end; pte++, addr += PAGE_SIZE) +		smaps_pte_entry(*pte, addr, PAGE_SIZE, walk);  	pte_unmap_unlock(pte - 1, ptl);  	cond_resched();  	return 0;  } -static int show_smap(struct seq_file *m, void *v) +static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) +{ +	/* +	 * Don't forget to update Documentation/ on changes. +	 */ +	static const char mnemonics[BITS_PER_LONG][2] = { +		/* +		 * In case if we meet a flag we don't know about. +		 */ +		[0 ... 
(BITS_PER_LONG-1)] = "??", + +		[ilog2(VM_READ)]	= "rd", +		[ilog2(VM_WRITE)]	= "wr", +		[ilog2(VM_EXEC)]	= "ex", +		[ilog2(VM_SHARED)]	= "sh", +		[ilog2(VM_MAYREAD)]	= "mr", +		[ilog2(VM_MAYWRITE)]	= "mw", +		[ilog2(VM_MAYEXEC)]	= "me", +		[ilog2(VM_MAYSHARE)]	= "ms", +		[ilog2(VM_GROWSDOWN)]	= "gd", +		[ilog2(VM_PFNMAP)]	= "pf", +		[ilog2(VM_DENYWRITE)]	= "dw", +		[ilog2(VM_LOCKED)]	= "lo", +		[ilog2(VM_IO)]		= "io", +		[ilog2(VM_SEQ_READ)]	= "sr", +		[ilog2(VM_RAND_READ)]	= "rr", +		[ilog2(VM_DONTCOPY)]	= "dc", +		[ilog2(VM_DONTEXPAND)]	= "de", +		[ilog2(VM_ACCOUNT)]	= "ac", +		[ilog2(VM_NORESERVE)]	= "nr", +		[ilog2(VM_HUGETLB)]	= "ht", +		[ilog2(VM_NONLINEAR)]	= "nl", +		[ilog2(VM_ARCH_1)]	= "ar", +		[ilog2(VM_DONTDUMP)]	= "dd", +#ifdef CONFIG_MEM_SOFT_DIRTY +		[ilog2(VM_SOFTDIRTY)]	= "sd", +#endif +		[ilog2(VM_MIXEDMAP)]	= "mm", +		[ilog2(VM_HUGEPAGE)]	= "hg", +		[ilog2(VM_NOHUGEPAGE)]	= "nh", +		[ilog2(VM_MERGEABLE)]	= "mg", +	}; +	size_t i; + +	seq_puts(m, "VmFlags: "); +	for (i = 0; i < BITS_PER_LONG; i++) { +		if (vma->vm_flags & (1UL << i)) { +			seq_printf(m, "%c%c ", +				   mnemonics[i][0], mnemonics[i][1]); +		} +	} +	seq_putc(m, '\n'); +} + +static int show_smap(struct seq_file *m, void *v, int is_pid)  {  	struct proc_maps_private *priv = m->private;  	struct task_struct *task = priv->task; @@ -403,7 +599,7 @@ static int show_smap(struct seq_file *m, void *v)  	if (vma->vm_mm && !is_vm_hugetlb_page(vma))  		walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); -	show_map_vma(m, vma); +	show_map_vma(m, vma, is_pid);  	seq_printf(m,  		   "Size:           %8lu kB\n" @@ -415,9 +611,11 @@ static int show_smap(struct seq_file *m, void *v)  		   "Private_Dirty:  %8lu kB\n"  		   "Referenced:     %8lu kB\n"  		   "Anonymous:      %8lu kB\n" +		   "AnonHugePages:  %8lu kB\n"  		   "Swap:           %8lu kB\n"  		   "KernelPageSize: %8lu kB\n" -		   "MMUPageSize:    %8lu kB\n", +		   "MMUPageSize:    %8lu kB\n" +		   "Locked:         %8lu kB\n",  		   (vma->vm_end - vma->vm_start) >> 10,  		   mss.resident >> 10,  		   (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), @@ -427,45 +625,150 @@ static int show_smap(struct seq_file *m, void *v)  		   mss.private_dirty >> 10,  		   mss.referenced >> 10,  		   mss.anonymous >> 10, +		   mss.anonymous_thp >> 10,  		   mss.swap >> 10,  		   vma_kernel_pagesize(vma) >> 10, -		   vma_mmu_pagesize(vma) >> 10); +		   vma_mmu_pagesize(vma) >> 10, +		   (vma->vm_flags & VM_LOCKED) ? +			(unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); + +	if (vma->vm_flags & VM_NONLINEAR) +		seq_printf(m, "Nonlinear:      %8lu kB\n", +				mss.nonlinear >> 10); + +	show_smap_vma_flags(m, vma);  	if (m->count < m->size)  /* vma is copied successfully */ -		m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; +		m->version = (vma != get_gate_vma(task->mm)) +			? 
vma->vm_start : 0;  	return 0;  } +static int show_pid_smap(struct seq_file *m, void *v) +{ +	return show_smap(m, v, 1); +} + +static int show_tid_smap(struct seq_file *m, void *v) +{ +	return show_smap(m, v, 0); +} +  static const struct seq_operations proc_pid_smaps_op = {  	.start	= m_start,  	.next	= m_next,  	.stop	= m_stop, -	.show	= show_smap +	.show	= show_pid_smap +}; + +static const struct seq_operations proc_tid_smaps_op = { +	.start	= m_start, +	.next	= m_next, +	.stop	= m_stop, +	.show	= show_tid_smap  }; -static int smaps_open(struct inode *inode, struct file *file) +static int pid_smaps_open(struct inode *inode, struct file *file)  {  	return do_maps_open(inode, file, &proc_pid_smaps_op);  } -const struct file_operations proc_smaps_operations = { -	.open		= smaps_open, +static int tid_smaps_open(struct inode *inode, struct file *file) +{ +	return do_maps_open(inode, file, &proc_tid_smaps_op); +} + +const struct file_operations proc_pid_smaps_operations = { +	.open		= pid_smaps_open, +	.read		= seq_read, +	.llseek		= seq_lseek, +	.release	= seq_release_private, +}; + +const struct file_operations proc_tid_smaps_operations = { +	.open		= tid_smaps_open,  	.read		= seq_read,  	.llseek		= seq_lseek,  	.release	= seq_release_private,  }; +/* + * We do not want to have constant page-shift bits sitting in + * pagemap entries and are about to reuse them some time soon. + * + * Here's the "migration strategy": + * 1. when the system boots these bits remain what they are, + *    but a warning about future change is printed in log; + * 2. once anyone clears soft-dirty bits via clear_refs file, + *    these flag is set to denote, that user is aware of the + *    new API and those page-shift bits change their meaning. + *    The respective warning is printed in dmesg; + * 3. In a couple of releases we will remove all the mentions + *    of page-shift in pagemap entries. + */ + +static bool soft_dirty_cleared __read_mostly; + +enum clear_refs_types { +	CLEAR_REFS_ALL = 1, +	CLEAR_REFS_ANON, +	CLEAR_REFS_MAPPED, +	CLEAR_REFS_SOFT_DIRTY, +	CLEAR_REFS_LAST, +}; + +struct clear_refs_private { +	struct vm_area_struct *vma; +	enum clear_refs_types type; +}; + +static inline void clear_soft_dirty(struct vm_area_struct *vma, +		unsigned long addr, pte_t *pte) +{ +#ifdef CONFIG_MEM_SOFT_DIRTY +	/* +	 * The soft-dirty tracker uses #PF-s to catch writes +	 * to pages, so write-protect the pte as well. See the +	 * Documentation/vm/soft-dirty.txt for full description +	 * of how soft-dirty works. 
+	 */ +	pte_t ptent = *pte; + +	if (pte_present(ptent)) { +		ptent = pte_wrprotect(ptent); +		ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); +	} else if (is_swap_pte(ptent)) { +		ptent = pte_swp_clear_soft_dirty(ptent); +	} else if (pte_file(ptent)) { +		ptent = pte_file_clear_soft_dirty(ptent); +	} + +	set_pte_at(vma->vm_mm, addr, pte, ptent); +#endif +} +  static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,  				unsigned long end, struct mm_walk *walk)  { -	struct vm_area_struct *vma = walk->private; +	struct clear_refs_private *cp = walk->private; +	struct vm_area_struct *vma = cp->vma;  	pte_t *pte, ptent;  	spinlock_t *ptl;  	struct page *page; +	split_huge_page_pmd(vma, addr, pmd); +	if (pmd_trans_unstable(pmd)) +		return 0; +  	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);  	for (; addr != end; pte++, addr += PAGE_SIZE) {  		ptent = *pte; + +		if (cp->type == CLEAR_REFS_SOFT_DIRTY) { +			clear_soft_dirty(vma, addr, pte); +			continue; +		} +  		if (!pte_present(ptent))  			continue; @@ -482,10 +785,6 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,  	return 0;  } -#define CLEAR_REFS_ALL 1 -#define CLEAR_REFS_ANON 2 -#define CLEAR_REFS_MAPPED 3 -  static ssize_t clear_refs_write(struct file *file, const char __user *buf,  				size_t count, loff_t *ppos)  { @@ -493,29 +792,47 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,  	char buffer[PROC_NUMBUF];  	struct mm_struct *mm;  	struct vm_area_struct *vma; -	long type; +	enum clear_refs_types type; +	int itype; +	int rv;  	memset(buffer, 0, sizeof(buffer));  	if (count > sizeof(buffer) - 1)  		count = sizeof(buffer) - 1;  	if (copy_from_user(buffer, buf, count))  		return -EFAULT; -	if (strict_strtol(strstrip(buffer), 10, &type)) -		return -EINVAL; -	if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED) +	rv = kstrtoint(strstrip(buffer), 10, &itype); +	if (rv < 0) +		return rv; +	type = (enum clear_refs_types)itype; +	if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)  		return -EINVAL; -	task = get_proc_task(file->f_path.dentry->d_inode); + +	if (type == CLEAR_REFS_SOFT_DIRTY) { +		soft_dirty_cleared = true; +		pr_warn_once("The pagemap bits 55-60 has changed their meaning!" +			     " See the linux/Documentation/vm/pagemap.txt for " +			     "details.\n"); +	} + +	task = get_proc_task(file_inode(file));  	if (!task)  		return -ESRCH;  	mm = get_task_mm(task);  	if (mm) { +		struct clear_refs_private cp = { +			.type = type, +		};  		struct mm_walk clear_refs_walk = {  			.pmd_entry = clear_refs_pte_range,  			.mm = mm, +			.private = &cp,  		};  		down_read(&mm->mmap_sem); +		if (type == CLEAR_REFS_SOFT_DIRTY) +			mmu_notifier_invalidate_range_start(mm, 0, -1);  		for (vma = mm->mmap; vma; vma = vma->vm_next) { -			clear_refs_walk.private = vma; +			cp.vma = vma;  			if (is_vm_hugetlb_page(vma))  				continue;  			/* @@ -526,14 +843,22 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,  			 *  			 * Writing 3 to /proc/pid/clear_refs only affects file  			 * mapped pages. +			 * +			 * Writing 4 to /proc/pid/clear_refs affects all pages.  			 
*/  			if (type == CLEAR_REFS_ANON && vma->vm_file)  				continue;  			if (type == CLEAR_REFS_MAPPED && !vma->vm_file)  				continue; +			if (type == CLEAR_REFS_SOFT_DIRTY) { +				if (vma->vm_flags & VM_SOFTDIRTY) +					vma->vm_flags &= ~VM_SOFTDIRTY; +			}  			walk_page_range(vma->vm_start, vma->vm_end,  					&clear_refs_walk);  		} +		if (type == CLEAR_REFS_SOFT_DIRTY) +			mmu_notifier_invalidate_range_end(mm, 0, -1);  		flush_tlb_mm(mm);  		up_read(&mm->mmap_sem);  		mmput(mm); @@ -548,12 +873,20 @@ const struct file_operations proc_clear_refs_operations = {  	.llseek		= noop_llseek,  }; +typedef struct { +	u64 pme; +} pagemap_entry_t; +  struct pagemapread { -	int pos, len; -	u64 *buffer; +	int pos, len;		/* units: PM_ENTRY_BYTES, not bytes */ +	pagemap_entry_t *buffer; +	bool v2;  }; -#define PM_ENTRY_BYTES      sizeof(u64) +#define PAGEMAP_WALK_SIZE	(PMD_SIZE) +#define PAGEMAP_WALK_MASK	(PMD_MASK) + +#define PM_ENTRY_BYTES      sizeof(pagemap_entry_t)  #define PM_STATUS_BITS      3  #define PM_STATUS_OFFSET    (64 - PM_STATUS_BITS)  #define PM_STATUS_MASK      (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET) @@ -561,19 +894,28 @@ struct pagemapread {  #define PM_PSHIFT_BITS      6  #define PM_PSHIFT_OFFSET    (PM_STATUS_OFFSET - PM_PSHIFT_BITS)  #define PM_PSHIFT_MASK      (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) -#define PM_PSHIFT(x)        (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) +#define __PM_PSHIFT(x)      (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)  #define PM_PFRAME_MASK      ((1LL << PM_PSHIFT_OFFSET) - 1)  #define PM_PFRAME(x)        ((x) & PM_PFRAME_MASK) +/* in "new" pagemap pshift bits are occupied with more status bits */ +#define PM_STATUS2(v2, x)   (__PM_PSHIFT(v2 ? x : PAGE_SHIFT)) +#define __PM_SOFT_DIRTY      (1LL)  #define PM_PRESENT          PM_STATUS(4LL)  #define PM_SWAP             PM_STATUS(2LL) -#define PM_NOT_PRESENT      PM_PSHIFT(PAGE_SHIFT) +#define PM_FILE             PM_STATUS(1LL) +#define PM_NOT_PRESENT(v2)  PM_STATUS2(v2, 0)  #define PM_END_OF_BUFFER    1 -static int add_to_pagemap(unsigned long addr, u64 pfn, +static inline pagemap_entry_t make_pme(u64 val) +{ +	return (pagemap_entry_t) { .pme = val }; +} + +static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,  			  struct pagemapread *pm)  { -	pm->buffer[pm->pos++] = pfn; +	pm->buffer[pm->pos++] = *pme;  	if (pm->pos >= pm->len)  		return PM_END_OF_BUFFER;  	return 0; @@ -585,60 +927,136 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,  	struct pagemapread *pm = walk->private;  	unsigned long addr;  	int err = 0; +	pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); +  	for (addr = start; addr < end; addr += PAGE_SIZE) { -		err = add_to_pagemap(addr, PM_NOT_PRESENT, pm); +		err = add_to_pagemap(addr, &pme, pm);  		if (err)  			break;  	}  	return err;  } -static u64 swap_pte_to_pagemap_entry(pte_t pte) +static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, +		struct vm_area_struct *vma, unsigned long addr, pte_t pte)  { -	swp_entry_t e = pte_to_swp_entry(pte); -	return swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT); +	u64 frame, flags; +	struct page *page = NULL; +	int flags2 = 0; + +	if (pte_present(pte)) { +		frame = pte_pfn(pte); +		flags = PM_PRESENT; +		page = vm_normal_page(vma, addr, pte); +		if (pte_soft_dirty(pte)) +			flags2 |= __PM_SOFT_DIRTY; +	} else if (is_swap_pte(pte)) { +		swp_entry_t entry; +		if (pte_swp_soft_dirty(pte)) +			flags2 |= __PM_SOFT_DIRTY; +		entry = 
pte_to_swp_entry(pte); +		frame = swp_type(entry) | +			(swp_offset(entry) << MAX_SWAPFILES_SHIFT); +		flags = PM_SWAP; +		if (is_migration_entry(entry)) +			page = migration_entry_to_page(entry); +	} else { +		if (vma->vm_flags & VM_SOFTDIRTY) +			flags2 |= __PM_SOFT_DIRTY; +		*pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); +		return; +	} + +	if (page && !PageAnon(page)) +		flags |= PM_FILE; +	if ((vma->vm_flags & VM_SOFTDIRTY)) +		flags2 |= __PM_SOFT_DIRTY; + +	*pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);  } -static u64 pte_to_pagemap_entry(pte_t pte) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, +		pmd_t pmd, int offset, int pmd_flags2) +{ +	/* +	 * Currently pmd for thp is always present because thp can not be +	 * swapped-out, migrated, or HWPOISONed (split in such cases instead.) +	 * This if-check is just to prepare for future implementation. +	 */ +	if (pmd_present(pmd)) +		*pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) +				| PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT); +	else +		*pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2)); +} +#else +static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, +		pmd_t pmd, int offset, int pmd_flags2)  { -	u64 pme = 0; -	if (is_swap_pte(pte)) -		pme = PM_PFRAME(swap_pte_to_pagemap_entry(pte)) -			| PM_PSHIFT(PAGE_SHIFT) | PM_SWAP; -	else if (pte_present(pte)) -		pme = PM_PFRAME(pte_pfn(pte)) -			| PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT; -	return pme;  } +#endif  static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,  			     struct mm_walk *walk)  {  	struct vm_area_struct *vma;  	struct pagemapread *pm = walk->private; +	spinlock_t *ptl;  	pte_t *pte;  	int err = 0; +	pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));  	/* find the first VMA at or above 'addr' */  	vma = find_vma(walk->mm, addr); +	if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { +		int pmd_flags2; + +		if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd)) +			pmd_flags2 = __PM_SOFT_DIRTY; +		else +			pmd_flags2 = 0; + +		for (; addr != end; addr += PAGE_SIZE) { +			unsigned long offset; + +			offset = (addr & ~PAGEMAP_WALK_MASK) >> +					PAGE_SHIFT; +			thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2); +			err = add_to_pagemap(addr, &pme, pm); +			if (err) +				break; +		} +		spin_unlock(ptl); +		return err; +	} + +	if (pmd_trans_unstable(pmd)) +		return 0;  	for (; addr != end; addr += PAGE_SIZE) { -		u64 pfn = PM_NOT_PRESENT; +		int flags2;  		/* check to see if we've left 'vma' behind  		 * and need a new, higher one */ -		if (vma && (addr >= vma->vm_end)) +		if (vma && (addr >= vma->vm_end)) {  			vma = find_vma(walk->mm, addr); +			if (vma && (vma->vm_flags & VM_SOFTDIRTY)) +				flags2 = __PM_SOFT_DIRTY; +			else +				flags2 = 0; +			pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); +		}  		/* check that 'vma' actually covers this address,  		 * and that it isn't a huge page vma */  		if (vma && (vma->vm_start <= addr) &&  		    !is_vm_hugetlb_page(vma)) {  			pte = pte_offset_map(pmd, addr); -			pfn = pte_to_pagemap_entry(*pte); +			pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);  			/* unmap before userspace copy */  			pte_unmap(pte);  		} -		err = add_to_pagemap(addr, pfn, pm); +		err = add_to_pagemap(addr, &pme, pm);  		if (err)  			return err;  	} @@ -649,13 +1067,16 @@ static int pagemap_pte_range(pmd_t *pmd, 
unsigned long addr, unsigned long end,  }  #ifdef CONFIG_HUGETLB_PAGE -static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset) +static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, +					pte_t pte, int offset, int flags2)  { -	u64 pme = 0;  	if (pte_present(pte)) -		pme = PM_PFRAME(pte_pfn(pte) + offset) -			| PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT; -	return pme; +		*pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset)	| +				PM_STATUS2(pm->v2, flags2)		| +				PM_PRESENT); +	else +		*pme = make_pme(PM_NOT_PRESENT(pm->v2)			| +				PM_STATUS2(pm->v2, flags2));  }  /* This function walks within one hugetlb entry in the single call */ @@ -664,13 +1085,23 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,  				 struct mm_walk *walk)  {  	struct pagemapread *pm = walk->private; +	struct vm_area_struct *vma;  	int err = 0; -	u64 pfn; +	int flags2; +	pagemap_entry_t pme; + +	vma = find_vma(walk->mm, addr); +	WARN_ON_ONCE(!vma); + +	if (vma && (vma->vm_flags & VM_SOFTDIRTY)) +		flags2 = __PM_SOFT_DIRTY; +	else +		flags2 = 0;  	for (; addr != end; addr += PAGE_SIZE) {  		int offset = (addr & ~hmask) >> PAGE_SHIFT; -		pfn = huge_pte_to_pagemap_entry(*pte, offset); -		err = add_to_pagemap(addr, pfn, pm); +		huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2); +		err = add_to_pagemap(addr, &pme, pm);  		if (err)  			return err;  	} @@ -687,11 +1118,11 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,   * For each page in the address space, this file contains one 64-bit entry   * consisting of the following:   * - * Bits 0-55  page frame number (PFN) if present + * Bits 0-54  page frame number (PFN) if present   * Bits 0-4   swap type if swapped - * Bits 5-55  swap offset if swapped + * Bits 5-54  swap offset if swapped   * Bits 55-60 page shift (page size = 1<<page shift) - * Bit  61    reserved for future use + * Bit  61    page is file-page or shared-anon   * Bit  62    page swapped   * Bit  63    page present   * @@ -705,11 +1136,10 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,   * determine which areas of memory are actually mapped and llseek to   * skip over unmapped regions.   
*/ -#define PAGEMAP_WALK_SIZE	(PMD_SIZE)  static ssize_t pagemap_read(struct file *file, char __user *buf,  			    size_t count, loff_t *ppos)  { -	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); +	struct task_struct *task = get_proc_task(file_inode(file));  	struct mm_struct *mm;  	struct pagemapread pm;  	int ret = -ESRCH; @@ -723,29 +1153,26 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,  	if (!task)  		goto out; -	ret = -EACCES; -	if (!ptrace_may_access(task, PTRACE_MODE_READ)) -		goto out_task; -  	ret = -EINVAL;  	/* file position must be aligned */  	if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))  		goto out_task;  	ret = 0; -  	if (!count)  		goto out_task; -	mm = get_task_mm(task); -	if (!mm) -		goto out_task; - -	pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); -	pm.buffer = kmalloc(pm.len, GFP_TEMPORARY); +	pm.v2 = soft_dirty_cleared; +	pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); +	pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY);  	ret = -ENOMEM;  	if (!pm.buffer) -		goto out_mm; +		goto out_task; + +	mm = mm_access(task, PTRACE_MODE_READ); +	ret = PTR_ERR(mm); +	if (!mm || IS_ERR(mm)) +		goto out_free;  	pagemap_walk.pmd_entry = pagemap_pte_range;  	pagemap_walk.pte_hole = pagemap_pte_hole; @@ -776,7 +1203,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,  		unsigned long end;  		pm.pos = 0; -		end = start_vaddr + PAGEMAP_WALK_SIZE; +		end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;  		/* overflow ? */  		if (end < start_vaddr || end > end_vaddr)  			end = end_vaddr; @@ -788,7 +1215,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,  		len = min(count, PM_ENTRY_BYTES * pm.pos);  		if (copy_to_user(buf, pm.buffer, len)) {  			ret = -EFAULT; -			goto out_free; +			goto out_mm;  		}  		copied += len;  		buf += len; @@ -798,41 +1225,322 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,  	if (!ret || ret == PM_END_OF_BUFFER)  		ret = copied; -out_free: -	kfree(pm.buffer);  out_mm:  	mmput(mm); +out_free: +	kfree(pm.buffer);  out_task:  	put_task_struct(task);  out:  	return ret;  } +static int pagemap_open(struct inode *inode, struct file *file) +{ +	pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about " +			"to stop being page-shift some time soon. 
See the " +			"linux/Documentation/vm/pagemap.txt for details.\n"); +	return 0; +} +  const struct file_operations proc_pagemap_operations = {  	.llseek		= mem_lseek, /* borrow this */  	.read		= pagemap_read, +	.open		= pagemap_open,  };  #endif /* CONFIG_PROC_PAGE_MONITOR */  #ifdef CONFIG_NUMA -extern int show_numa_map(struct seq_file *m, void *v); + +struct numa_maps { +	struct vm_area_struct *vma; +	unsigned long pages; +	unsigned long anon; +	unsigned long active; +	unsigned long writeback; +	unsigned long mapcount_max; +	unsigned long dirty; +	unsigned long swapcache; +	unsigned long node[MAX_NUMNODES]; +}; + +struct numa_maps_private { +	struct proc_maps_private proc_maps; +	struct numa_maps md; +}; + +static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, +			unsigned long nr_pages) +{ +	int count = page_mapcount(page); + +	md->pages += nr_pages; +	if (pte_dirty || PageDirty(page)) +		md->dirty += nr_pages; + +	if (PageSwapCache(page)) +		md->swapcache += nr_pages; + +	if (PageActive(page) || PageUnevictable(page)) +		md->active += nr_pages; + +	if (PageWriteback(page)) +		md->writeback += nr_pages; + +	if (PageAnon(page)) +		md->anon += nr_pages; + +	if (count > md->mapcount_max) +		md->mapcount_max = count; + +	md->node[page_to_nid(page)] += nr_pages; +} + +static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, +		unsigned long addr) +{ +	struct page *page; +	int nid; + +	if (!pte_present(pte)) +		return NULL; + +	page = vm_normal_page(vma, addr, pte); +	if (!page) +		return NULL; + +	if (PageReserved(page)) +		return NULL; + +	nid = page_to_nid(page); +	if (!node_isset(nid, node_states[N_MEMORY])) +		return NULL; + +	return page; +} + +static int gather_pte_stats(pmd_t *pmd, unsigned long addr, +		unsigned long end, struct mm_walk *walk) +{ +	struct numa_maps *md; +	spinlock_t *ptl; +	pte_t *orig_pte; +	pte_t *pte; + +	md = walk->private; + +	if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) { +		pte_t huge_pte = *(pte_t *)pmd; +		struct page *page; + +		page = can_gather_numa_stats(huge_pte, md->vma, addr); +		if (page) +			gather_stats(page, md, pte_dirty(huge_pte), +				     HPAGE_PMD_SIZE/PAGE_SIZE); +		spin_unlock(ptl); +		return 0; +	} + +	if (pmd_trans_unstable(pmd)) +		return 0; +	orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); +	do { +		struct page *page = can_gather_numa_stats(*pte, md->vma, addr); +		if (!page) +			continue; +		gather_stats(page, md, pte_dirty(*pte), 1); + +	} while (pte++, addr += PAGE_SIZE, addr != end); +	pte_unmap_unlock(orig_pte, ptl); +	return 0; +} +#ifdef CONFIG_HUGETLB_PAGE +static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, +		unsigned long addr, unsigned long end, struct mm_walk *walk) +{ +	struct numa_maps *md; +	struct page *page; + +	if (!pte_present(*pte)) +		return 0; + +	page = pte_page(*pte); +	if (!page) +		return 0; + +	md = walk->private; +	gather_stats(page, md, pte_dirty(*pte), 1); +	return 0; +} + +#else +static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, +		unsigned long addr, unsigned long end, struct mm_walk *walk) +{ +	return 0; +} +#endif + +/* + * Display pages allocated per node and memory policy via /proc. 
+ */ +static int show_numa_map(struct seq_file *m, void *v, int is_pid) +{ +	struct numa_maps_private *numa_priv = m->private; +	struct proc_maps_private *proc_priv = &numa_priv->proc_maps; +	struct vm_area_struct *vma = v; +	struct numa_maps *md = &numa_priv->md; +	struct file *file = vma->vm_file; +	struct task_struct *task = proc_priv->task; +	struct mm_struct *mm = vma->vm_mm; +	struct mm_walk walk = {}; +	struct mempolicy *pol; +	char buffer[64]; +	int nid; + +	if (!mm) +		return 0; + +	/* Ensure we start with an empty set of numa_maps statistics. */ +	memset(md, 0, sizeof(*md)); + +	md->vma = vma; + +	walk.hugetlb_entry = gather_hugetbl_stats; +	walk.pmd_entry = gather_pte_stats; +	walk.private = md; +	walk.mm = mm; + +	pol = get_vma_policy(task, vma, vma->vm_start); +	mpol_to_str(buffer, sizeof(buffer), pol); +	mpol_cond_put(pol); + +	seq_printf(m, "%08lx %s", vma->vm_start, buffer); + +	if (file) { +		seq_puts(m, " file="); +		seq_path(m, &file->f_path, "\n\t= "); +	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { +		seq_puts(m, " heap"); +	} else { +		pid_t tid = vm_is_stack(task, vma, is_pid); +		if (tid != 0) { +			/* +			 * Thread stack in /proc/PID/task/TID/maps or +			 * the main process stack. +			 */ +			if (!is_pid || (vma->vm_start <= mm->start_stack && +			    vma->vm_end >= mm->start_stack)) +				seq_puts(m, " stack"); +			else +				seq_printf(m, " stack:%d", tid); +		} +	} + +	if (is_vm_hugetlb_page(vma)) +		seq_puts(m, " huge"); + +	walk_page_range(vma->vm_start, vma->vm_end, &walk); + +	if (!md->pages) +		goto out; + +	if (md->anon) +		seq_printf(m, " anon=%lu", md->anon); + +	if (md->dirty) +		seq_printf(m, " dirty=%lu", md->dirty); + +	if (md->pages != md->anon && md->pages != md->dirty) +		seq_printf(m, " mapped=%lu", md->pages); + +	if (md->mapcount_max > 1) +		seq_printf(m, " mapmax=%lu", md->mapcount_max); + +	if (md->swapcache) +		seq_printf(m, " swapcache=%lu", md->swapcache); + +	if (md->active < md->pages && !is_vm_hugetlb_page(vma)) +		seq_printf(m, " active=%lu", md->active); + +	if (md->writeback) +		seq_printf(m, " writeback=%lu", md->writeback); + +	for_each_node_state(nid, N_MEMORY) +		if (md->node[nid]) +			seq_printf(m, " N%d=%lu", nid, md->node[nid]); +out: +	seq_putc(m, '\n'); + +	if (m->count < m->size) +		m->version = (vma != proc_priv->tail_vma) ? 
vma->vm_start : 0; +	return 0; +} + +static int show_pid_numa_map(struct seq_file *m, void *v) +{ +	return show_numa_map(m, v, 1); +} + +static int show_tid_numa_map(struct seq_file *m, void *v) +{ +	return show_numa_map(m, v, 0); +}  static const struct seq_operations proc_pid_numa_maps_op = { -        .start  = m_start, -        .next   = m_next, -        .stop   = m_stop, -        .show   = show_numa_map, +	.start  = m_start, +	.next   = m_next, +	.stop   = m_stop, +	.show   = show_pid_numa_map, +}; + +static const struct seq_operations proc_tid_numa_maps_op = { +	.start  = m_start, +	.next   = m_next, +	.stop   = m_stop, +	.show   = show_tid_numa_map,  }; -static int numa_maps_open(struct inode *inode, struct file *file) +static int numa_maps_open(struct inode *inode, struct file *file, +			  const struct seq_operations *ops)  { -	return do_maps_open(inode, file, &proc_pid_numa_maps_op); +	struct numa_maps_private *priv; +	int ret = -ENOMEM; +	priv = kzalloc(sizeof(*priv), GFP_KERNEL); +	if (priv) { +		priv->proc_maps.pid = proc_pid(inode); +		ret = seq_open(file, ops); +		if (!ret) { +			struct seq_file *m = file->private_data; +			m->private = priv; +		} else { +			kfree(priv); +		} +	} +	return ret; +} + +static int pid_numa_maps_open(struct inode *inode, struct file *file) +{ +	return numa_maps_open(inode, file, &proc_pid_numa_maps_op);  } -const struct file_operations proc_numa_maps_operations = { -	.open		= numa_maps_open, +static int tid_numa_maps_open(struct inode *inode, struct file *file) +{ +	return numa_maps_open(inode, file, &proc_tid_numa_maps_op); +} + +const struct file_operations proc_pid_numa_maps_operations = { +	.open		= pid_numa_maps_open,  	.read		= seq_read,  	.llseek		= seq_lseek,  	.release	= seq_release_private,  }; -#endif + +const struct file_operations proc_tid_numa_maps_operations = { +	.open		= tid_numa_maps_open, +	.read		= seq_read, +	.llseek		= seq_lseek, +	.release	= seq_release_private, +}; +#endif /* CONFIG_NUMA */  | 
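The patch above documents each /proc/PID/pagemap record as a 64-bit entry: bits 0-54 hold the PFN (or swap type/offset), bit 61 marks file-backed or shared-anon pages, bit 62 swap, bit 63 present, and bits 55-60 switch from "page shift" to flag bits (bit 55 = soft-dirty) once the new clear_refs API has been used. A minimal user-space decoder is sketched below for illustration; it is not part of the commit, and the helper name read_pagemap_entry is made up here.

```c
/*
 * Minimal sketch, not part of the commit: decode one pagemap entry
 * for a virtual address in the current process. Bit positions follow
 * the layout documented in the patch; bit 55 is soft-dirty only once
 * the clear_refs-based API switch described above has happened.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>

#define PM_PFRAME_MASK	((1ULL << 55) - 1)	/* bits 0-54: PFN or swap info */
#define PM_SOFT_DIRTY	(1ULL << 55)		/* "new" layout only */
#define PM_FILE		(1ULL << 61)		/* file-page or shared-anon */
#define PM_SWAP		(1ULL << 62)
#define PM_PRESENT	(1ULL << 63)

static int read_pagemap_entry(uintptr_t vaddr, uint64_t *entry)
{
	long psz = sysconf(_SC_PAGESIZE);
	off_t off = (off_t)(vaddr / psz) * sizeof(*entry);
	int fd = open("/proc/self/pagemap", O_RDONLY);
	int ok;

	if (fd < 0)
		return -1;
	ok = pread(fd, entry, sizeof(*entry), off) == sizeof(*entry);
	close(fd);
	return ok ? 0 : -1;
}

int main(void)
{
	char *buf = malloc(1);
	uint64_t entry;

	*buf = 1;	/* fault the page in so "present" is set */
	if (read_pagemap_entry((uintptr_t)buf, &entry))
		return 1;

	printf("present=%d swap=%d file/shared=%d soft-dirty=%d pfn=0x%llx\n",
	       !!(entry & PM_PRESENT), !!(entry & PM_SWAP),
	       !!(entry & PM_FILE), !!(entry & PM_SOFT_DIRTY),
	       (unsigned long long)(entry & PM_PFRAME_MASK));
	free(buf);
	return 0;
}
```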

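The clear_refs side of the soft-dirty interface added by the patch accepts the value 4 (CLEAR_REFS_SOFT_DIRTY): it clears the soft-dirty bits and write-protects the PTEs, so the next store to a page shows up as bit 55 in its pagemap entry. A rough, self-contained sketch of that round trip (again illustrative only, not taken from the commit):

```c
/*
 * Minimal sketch, not part of the commit: round-trip the soft-dirty
 * tracker. Writing "4" (CLEAR_REFS_SOFT_DIRTY) to clear_refs clears
 * the soft-dirty bits and write-protects the PTEs; the following
 * store should then read back as bit 55 in pagemap.
 */
#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	static char page[4096];
	uint64_t entry = 0;
	long psz = sysconf(_SC_PAGESIZE);
	int cr = open("/proc/self/clear_refs", O_WRONLY);
	int pm = open("/proc/self/pagemap", O_RDONLY);

	if (cr < 0 || pm < 0)
		return 1;

	if (write(cr, "4", 1) != 1)	/* CLEAR_REFS_SOFT_DIRTY */
		return 1;
	page[0] = 1;			/* dirty one page after the clear */

	pread(pm, &entry, sizeof(entry),
	      (off_t)((uintptr_t)page / psz) * sizeof(entry));
	printf("soft-dirty after store: %d\n", !!(entry & (1ULL << 55)));

	close(cr);
	close(pm);
	return 0;
}
```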